diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 06928c94a1f0be17a03101d15f8418dd0aafdd9b..a574449d3bb81a73566dd2cfaae935b7c991d9c9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -144,7 +144,7 @@ model_tests: - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml - > python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=file --model_data_format=file --address_sanitizer || exit 1; python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; python tools/converter.py benchmark --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=5 --model_graph_format=file --model_data_format=file || exit 1; python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=code --model_data_format=file || exit 1; @@ -195,7 +195,8 @@ extra_tests: GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS || exit 1; + - python tools/bazel_adb_run.py --target="//mace/utils:utils_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS || exit 1; + - python tools/bazel_adb_run.py --target="//mace/port:port_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS || exit 1; so_size_check: stage: so_size_check diff --git a/BUILD b/BUILD.bazel similarity index 100% rename from BUILD rename to BUILD.bazel diff --git a/docs/development/how_to_debug.rst b/docs/development/how_to_debug.rst index ea4688585562d812d06cdfa4a27935f3252df66a..1f516d28301b73fbe44ecd2cdaa9e1dd0aa7393e 100644 --- a/docs/development/how_to_debug.rst +++ b/docs/development/how_to_debug.rst @@ -101,17 +101,20 @@ MACE also provides model visualization HTML generated in `builds` directory, gen Debug engine using log -------------------------- -Mace defines two sorts of logs: one is for users (LOG), the other is for developers (VLOG). +MACE implements a logging mechanism similar to `glog <https://github.com/google/glog>`__. +There are two types of logs: LOG for normal logging and VLOG for debugging. -LOG includes four levels, i.e, ``INFO``, ``WARNING``, ``ERROR``, ``FATAL``; -Environment variable ``MACE_CPP_MIN_LOG_LEVEL`` can be set to specify log level of users, e.g., -``set MACE_CPP_MIN_LOG_LEVEL=0`` will enable ``INFO`` log level, while ``set MACE_CPP_MIN_LOG_LEVEL=4`` will enable ``FATAL`` log level. +LOG includes four levels, sorted by severity: ``INFO``, ``WARNING``, ``ERROR``, ``FATAL``. +The logging severity threshold can be configured via the environment variable ``MACE_CPP_MIN_LOG_LEVEL``, e.g. ``MACE_CPP_MIN_LOG_LEVEL=WARNING`` sets it to ``WARNING``. 
+Only log messages with severity equal to or above the specified threshold will be printed; the default threshold is ``INFO``. +We don't support integer log severity values as `glog <https://github.com/google/glog>`__ does, because they are easily confused with VLOG levels. +VLOG is verbose logging, which is emitted as ``LOG(INFO)``. VLOG also has finer-grained integer verbose levels, such as 0, 1, 2, 3. +The threshold can be configured through the environment variable ``MACE_CPP_MIN_VLOG_LEVEL``, e.g. ``MACE_CPP_MIN_VLOG_LEVEL=2`` sets it to ``2``. +With VLOG, the lower the verbose level, the more likely the message is to be logged. For example, when the threshold is set +to 2, both ``VLOG(1)`` and ``VLOG(2)`` log messages will be printed, but ``VLOG(3)`` and higher won't. -VLOG level is specified by numbers, e.g., 0, 1, 2. Environment variable ``MACE_CPP_MIN_VLOG_LEVEL`` can be set to specify vlog level. -Logs with higher levels than which is specified will be printed. So simply specifying a very large level number will make all logs printed. - -By using Mace run tool, vlog level can be easily set by option, e.g., +By using the ``mace_run`` tool, the VLOG level can be easily set by an option, e.g., .. code:: sh @@ -168,9 +171,3 @@ things may be a little bit complicated. # then you can use it as host gdb, e.g., bt - - - - - - diff --git a/docs/installation/env_requirement.rst b/docs/installation/env_requirement.rst index be15c67c0917d59caea47836225ba67143098bf9..4a599ec523e31413cf2bd7c169782bba488760d3 100644 --- a/docs/installation/env_requirement.rst +++ b/docs/installation/env_requirement.rst @@ -41,7 +41,7 @@ For Bazel, install it following installation guide. For python dependencies, .. code:: sh - pip install -U --user setup/requirements.txt + pip install -U --user -r setup/requirements.txt @@ -83,7 +83,7 @@ For python dependencies, .. code:: sh - pip install -U --user setup/optionals.txt + pip install -U --user -r setup/optionals.txt .. note:: diff --git a/docs/installation/using_docker.rst b/docs/installation/using_docker.rst index 61e929a33b5f14bf58a1fb74fb13e0598919cd3c..b0c5ac4e5781cb1857eb42f2a8f0fdf268fcb29a 100644 --- a/docs/installation/using_docker.rst +++ b/docs/installation/using_docker.rst @@ -15,18 +15,18 @@ In most cases, the ``lite edition`` image can satisfy developer's basic needs. .. code:: sh - # Pull lite edition docker image + # You can pull lite edition docker image from docker repo (recommended) docker pull registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev-lite - # Build lite edition docker image + # Or build lite edition docker image by yourself docker build -t registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev-lite ./docker/mace-dev-lite - ``full edition`` docker image (which contains multiple NDK versions and other dev tools). .. code:: sh - # Pull full edition docker image + # You can pull full edition docker image from docker repo (recommended) docker pull registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev - # Build full edition docker image + # Or build full edition docker image by yourself docker build -t registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev ./docker/mace-dev .. note:: diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 58c0f9352df91652220f6c45c0e5a76a504d4d80..dfd69cca91ef8ac90f35d1aa3dc6a4a9d8f832ac 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -81,7 +81,7 @@ in one deployment file. * - backend - The onnx backend framework for validation, could be [tensorflow, caffe2, pytorch], default is tensorflow. 
* - runtime - - The running device, one of [cpu, gpu, dsp, cpu_gpu]. cpu_gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. + - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. * - data_type - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP. * - input_data_types @@ -421,11 +421,6 @@ the detailed information is in :doc:`benchmark`. - 3 - ``run``/``benchmark`` - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH - * - --gpu_perf_hint - - int - - 3 - - ``run``/``benchmark`` - - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH * - --gpu_priority_hint - int - 3 diff --git a/mace/BUILD b/mace/BUILD.bazel similarity index 78% rename from mace/BUILD rename to mace/BUILD.bazel index 4b7da51fccfac614fe845bdd95e58d960c62ed75..ef1c338d0838c12ef2c44035e6b8104baf1d6361 100644 --- a/mace/BUILD +++ b/mace/BUILD.bazel @@ -6,6 +6,22 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "linux", + define_values = { + "linux": "true", + }, + visibility = ["//visibility:public"], +) + +config_setting( + name = "darwin", + define_values = { + "darwin": "true", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "android_armv7", values = { @@ -62,6 +78,17 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "hta_enabled", + define_values = { + "hta": "true", + }, + values = { + "crosstool_top": "//external:android/crosstool", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "openmp_enabled", define_values = { diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD.bazel similarity index 100% rename from mace/benchmark/BUILD rename to mace/benchmark/BUILD.bazel diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index 4bd44ada514baf095cdb4bdfb6520808a285172c..adb267f3c8bb5361e5b4f929d3888b37b1c014f2 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -21,9 +21,11 @@ #include // NOLINT(build/c++11) #include "gflags/gflags.h" +#include "mace/port/env.h" +#include "mace/port/file_system.h" #include "mace/public/mace.h" #include "mace/utils/logging.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" #include "mace/benchmark/statistics.h" #ifdef MODEL_GRAPH_FORMAT_CODE #include "mace/codegen/engine/mace_engine_factory.h" @@ -31,24 +33,6 @@ namespace mace { namespace benchmark { -namespace str_util { - -std::vector Split(const std::string &str, char delims) { - std::vector result; - std::string tmp = str; - while (!tmp.empty()) { - size_t next_offset = tmp.find(delims); - result.push_back(tmp.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp = tmp.substr(next_offset + 1); - } - } - return result; -} - -} // namespace str_util void ParseShape(const std::string &str, std::vector *shape) { std::string tmp = str; @@ -90,6 +74,18 @@ DeviceType ParseDeviceType(const std::string &device_str) { } } +DataFormat ParseDataFormat(const std::string &data_format_str) { + if (data_format_str == "NHWC") { + return DataFormat::NHWC; + } else if (data_format_str == "NCHW") { + return DataFormat::NCHW; + } else if (data_format_str == "OIHW") { + return DataFormat::OIHW; + } else { + return DataFormat::DF_NONE; + } +} + bool RunInference(MaceEngine *engine, const std::map &input_infos, std::map *output_infos, @@ -168,6 
+164,12 @@ DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma"); DEFINE_string(input_shape, "", "input shape, separated by colon and comma"); DEFINE_string(output_shape, "", "output shape, separated by colon and comma"); +DEFINE_string(input_data_format, + "NHWC", + "input data formats, NONE|NHWC|NCHW"); +DEFINE_string(output_data_format, + "NHWC", + "output data formats, NONE|NHWC|NCHW"); DEFINE_string(input_file, "", "input file name"); DEFINE_int32(max_num_runs, 100, "max number of runs"); DEFINE_double(max_seconds, 10.0, "max number of seconds to run"); @@ -213,14 +215,10 @@ int Main(int argc, char **argv) { std::unique_ptr statistician(new OpStat()); - std::vector input_names = - str_util::Split(FLAGS_input_node, ','); - std::vector output_names = - str_util::Split(FLAGS_output_node, ','); - std::vector input_shapes = - str_util::Split(FLAGS_input_shape, ':'); - std::vector output_shapes = - str_util::Split(FLAGS_output_shape, ':'); + std::vector input_names = Split(FLAGS_input_node, ','); + std::vector output_names = Split(FLAGS_output_node, ','); + std::vector input_shapes = Split(FLAGS_input_shape, ':'); + std::vector output_shapes = Split(FLAGS_output_shape, ':'); const size_t input_count = input_shapes.size(); const size_t output_count = output_shapes.size(); @@ -233,6 +231,19 @@ int Main(int argc, char **argv) { ParseShape(output_shapes[i], &output_shape_vec[i]); } + std::vector raw_input_data_formats = + Split(FLAGS_input_data_format, ','); + std::vector raw_output_data_formats = + Split(FLAGS_output_data_format, ','); + std::vector input_data_formats(input_count); + std::vector output_data_formats(output_count); + for (size_t i = 0; i < input_count; ++i) { + input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]); + } + for (size_t i = 0; i < output_count; ++i) { + output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]); + } + mace::DeviceType device_type = ParseDeviceType(FLAGS_device); // configuration @@ -273,41 +284,46 @@ int Main(int argc, char **argv) { std::shared_ptr engine; MaceStatus create_engine_status; // Create Engine - std::vector model_graph_data; + std::unique_ptr model_graph_data; if (FLAGS_model_file != "") { - if (!mace::ReadBinaryFile(&model_graph_data, FLAGS_model_file)) { + auto fs = GetFileSystem(); + auto status = fs->NewReadOnlyMemoryRegionFromFile(FLAGS_model_file.c_str(), + &model_graph_data); + if (status != MaceStatus::MACE_SUCCESS) { LOG(FATAL) << "Failed to read file: " << FLAGS_model_file; } } - const unsigned char *model_weights_data = nullptr; - size_t model_weights_data_size = 0; + std::unique_ptr model_weights_data; if (FLAGS_model_data_file != "") { - MemoryMap(FLAGS_model_data_file, - &model_weights_data, - &model_weights_data_size); - MACE_CHECK(model_weights_data != nullptr && model_weights_data_size != 0); + auto fs = GetFileSystem(); + auto status = fs->NewReadOnlyMemoryRegionFromFile( + FLAGS_model_data_file.c_str(), + &model_weights_data); + if (status != MaceStatus::MACE_SUCCESS) { + LOG(FATAL) << "Failed to read file: " << FLAGS_model_data_file; + } + MACE_CHECK(model_weights_data->length() > 0); } #ifdef MODEL_GRAPH_FORMAT_CODE - create_engine_status = - CreateMaceEngineFromCode(FLAGS_model_name, - model_weights_data, - model_weights_data_size, - input_names, - output_names, - config, - &engine); + create_engine_status = CreateMaceEngineFromCode(FLAGS_model_name, + reinterpret_cast(model_weights_data->data()), + model_weights_data->length(), + input_names, + 
output_names, + config, + &engine); #else - create_engine_status = - CreateMaceEngineFromProto(model_graph_data.data(), - model_graph_data.size(), - model_weights_data, - model_weights_data_size, - input_names, - output_names, - config, - &engine); + create_engine_status = CreateMaceEngineFromProto( + reinterpret_cast(model_graph_data->data()), + model_graph_data->length(), + reinterpret_cast(model_weights_data->data()), + model_weights_data->length(), + input_names, + output_names, + config, + &engine); #endif if (create_engine_status != MaceStatus::MACE_SUCCESS) { LOG(FATAL) << "Create engine error, please check the arguments"; @@ -333,7 +349,8 @@ int Main(int argc, char **argv) { LOG(INFO) << "Open input file failed"; return -1; } - inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in); + inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in, + input_data_formats[i]); } for (size_t i = 0; i < output_count; ++i) { @@ -344,7 +361,8 @@ int Main(int argc, char **argv) { auto buffer_out = std::shared_ptr(new float[output_size], std::default_delete()); outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i], - buffer_out); + buffer_out, + output_data_formats[i]); } int64_t warmup_time_us = 0; @@ -380,10 +398,6 @@ int Main(int argc, char **argv) { statistician->PrintStat(); - if (model_weights_data != nullptr) { - MemoryUnMap(model_weights_data, model_weights_data_size); - } - return 0; } diff --git a/mace/benchmark/model_throughput_test.cc b/mace/benchmark/model_throughput_test.cc index 66b178cf7178919adf57d064f2aa21ccee0dc491..cdc4639155cdab36f45eb038907e7ac71e069f2e 100644 --- a/mace/benchmark/model_throughput_test.cc +++ b/mace/benchmark/model_throughput_test.cc @@ -23,8 +23,7 @@ * --dsp_model_data_file=dsp_model_data.data \ * --run_seconds=10 */ -#include -#include +#include #include #include #include @@ -33,7 +32,7 @@ #include "gflags/gflags.h" #include "mace/public/mace.h" -#include "mace/utils/env_time.h" +#include "mace/port/env.h" #include "mace/utils/logging.h" #include "mace/core/types.h" diff --git a/mace/codegen/BUILD b/mace/codegen/BUILD.bazel similarity index 100% rename from mace/codegen/BUILD rename to mace/codegen/BUILD.bazel diff --git a/mace/core/BUILD b/mace/core/BUILD.bazel similarity index 73% rename from mace/core/BUILD rename to mace/core/BUILD.bazel index 2e37524ffd8a77e400ba2924cd656586744b3af3..91df4f0f1d0d0a66b2903575a4373b26897628cb 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD.bazel @@ -10,11 +10,14 @@ licenses(["notice"]) # Apache 2.0 load( "//mace:mace.bzl", "if_android", + "if_android_armv7", "if_hexagon_enabled", - "if_not_hexagon_enabled", - "if_openmp_enabled", + "if_hta_enabled", + "if_hexagon_or_hta_enabled", "if_neon_enabled", + "if_not_hexagon_enabled", "if_opencl_enabled", + "if_openmp_enabled", "if_quantize_enabled", ) @@ -32,17 +35,24 @@ cc_library( [ "runtime/opencl/*.cc", ], - )) + if_hexagon_enabled(glob([ - "runtime/hexagon/*.cc", - ])), + )) + if_hexagon_enabled([ + "runtime/hexagon/hexagon_dsp_wrapper.cc", + ]) + if_hta_enabled([ + "runtime/hexagon/hexagon_hta_wrapper.cc", + ]), hdrs = glob([ "*.h", "runtime/cpu/*.h", - ]) + if_opencl_enabled(glob( - [ - "runtime/opencl/*.h", - ], - )) + if_hexagon_enabled(glob(["runtime/hexagon/*.h"])), + ]) + if_opencl_enabled(glob([ + "runtime/opencl/*.h", + ])) + if_hexagon_or_hta_enabled(glob([ + "runtime/hexagon/hexagon_control_wrapper.h", + "runtime/hexagon/hexagon_device.h", + ])) + if_hexagon_enabled(glob([ + "runtime/hexagon/*dsp*.h", + ])) + 
if_hta_enabled(glob([ + "runtime/hexagon/*hta*.h", + ])), copts = [ "-Werror", "-Wextra", @@ -56,17 +66,20 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", + ]) + if_hta_enabled([ + "-DMACE_ENABLE_HTA", ]) + if_neon_enabled([ "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + "-mfloat-abi=softfp", ]), - linkopts = ["-ldl"] + if_android([ - "-pie", - "-lm", - ]), + linkopts = ["-ldl"], deps = [ "//mace/codegen:generated_version", "//mace/proto:mace_cc", "//mace/utils", + "//mace/port", "@half//:half", ] + if_opencl_enabled([ ":opencl_headers", @@ -75,6 +88,8 @@ cc_library( "@gemmlowp", ]) + if_hexagon_enabled([ "//third_party/nnlib:libhexagon", + ]) + if_hta_enabled([ + "//third_party/hta", ]), ) diff --git a/mace/core/allocator.h b/mace/core/allocator.h index 9c9103635245921ca2c354702b8ec9b062c40f37..c7499b92b51053436e61edabf4c93069c93f7a81 100644 --- a/mace/core/allocator.h +++ b/mace/core/allocator.h @@ -15,14 +15,13 @@ #ifndef MACE_CORE_ALLOCATOR_H_ #define MACE_CORE_ALLOCATOR_H_ -#include -#include +#include #include #include #include #include -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/core/types.h" #include "mace/core/runtime_failure_mock.h" #include "mace/public/mace.h" diff --git a/mace/core/buffer.h b/mace/core/buffer.h index 66684db150f459f877ac6b9a893b9027f9644548..d1f5f1a507ffde8f884b81096ea19b7ffd60ba73 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -21,8 +21,9 @@ #include #include "mace/core/allocator.h" -#include "mace/core/macros.h" #include "mace/core/types.h" +#include "mace/utils/logging.h" +#include "mace/utils/macros.h" namespace mace { namespace core { @@ -434,16 +435,11 @@ class BufferSlice : public BufferBase { } void *Map(index_t offset, index_t length, std::vector *pitch) const { - MACE_UNUSED(offset); - MACE_UNUSED(length); - MACE_UNUSED(pitch); - MACE_NOT_IMPLEMENTED; - return nullptr; + return buffer_->Map(offset_ + offset, length, pitch); } void UnMap(void *mapped_ptr) const { - MACE_UNUSED(mapped_ptr); - MACE_NOT_IMPLEMENTED; + buffer_->UnMap(mapped_ptr); } void Map(std::vector *pitch) { @@ -507,7 +503,7 @@ class ScratchBuffer: public Buffer { virtual ~ScratchBuffer() {} MaceStatus GrowSize(const index_t size) { - if (size > size_) { + if (offset_ + size > size_) { VLOG(1) << "Grow scratch size to: " << size; MACE_CHECK(offset_ == 0, "scratch is being used, cannot grow size"); return Resize(size); diff --git a/mace/core/device.cc b/mace/core/device.cc index 177b443ba25c729c54a49f4d77cc09cfac952879..535b7193633cf6881fea54f129c0485ddc3ed585 100644 --- a/mace/core/device.cc +++ b/mace/core/device.cc @@ -15,16 +15,17 @@ #include "mace/core/device.h" #include "mace/core/buffer.h" +#include "mace/utils/memory.h" namespace mace { CPUDevice::CPUDevice(const int num_threads, const CPUAffinityPolicy policy, const bool use_gemmlowp) - : cpu_runtime_(new CPURuntime(num_threads, - policy, - use_gemmlowp)), - scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {} + : cpu_runtime_(make_unique(num_threads, + policy, + use_gemmlowp)), + scratch_buffer_(make_unique(GetCPUAllocator())) {} CPUDevice::~CPUDevice() = default; diff --git a/mace/core/future.h b/mace/core/future.h index 13382e1bf84575f2b0e5e63b0d881d720fc0e5d9..c7227d4df6ade05b1a6d392de0dbfa4772dff39d 100644 --- a/mace/core/future.h +++ b/mace/core/future.h @@ -20,11 +20,10 @@ #include #include "mace/utils/logging.h" +#include "mace/public/mace.h" namespace mace { -struct CallStats; - // Wait the call to finish 
and get the stats if param is not nullptr struct StatsFuture { std::function wait_fn = [](CallStats *stats) { diff --git a/mace/core/kv_storage.cc b/mace/core/kv_storage.cc index 5eba8567171bba82b6f2e9d2bef094b5614490e8..e2feb8c827b5939098eb0b7d3a451b1ad62b44a6 100644 --- a/mace/core/kv_storage.cc +++ b/mace/core/kv_storage.cc @@ -13,18 +13,18 @@ // limitations under the License. #include -#include #include #include #include +#include #include #include #include #include #include "mace/core/kv_storage.h" -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/utils/logging.h" namespace mace { diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc index 756e9321cf3a93559737e8b5e3c897462e3a5488..004fb1a927ae9a15ad733ebcf61918c4983f99e0 100644 --- a/mace/core/memory_optimizer.cc +++ b/mace/core/memory_optimizer.cc @@ -21,8 +21,9 @@ #include #include "mace/core/arg_helper.h" -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/utils/logging.h" +#include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/opencl_util.h" @@ -61,12 +62,22 @@ void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) { } MemoryBlock MemoryOptimizer::CreateMemoryBlock( - std::vector shape, + const OperatorDef *op_def, + int output_idx, DataType dt, - mace::MemoryType mem_type) { + MemoryType mem_type) { + auto shape = std::vector( + op_def->output_shape(output_idx).dims().begin(), + op_def->output_shape(output_idx).dims().end()); MemoryBlock block; #ifdef MACE_ENABLE_OPENCL if (mem_type == MemoryType::GPU_IMAGE) { + OpenCLBufferType buffer_type = OpenCLBufferType::IN_OUT_CHANNEL; + if (op_def->type() == "BufferTransform") { + buffer_type = static_cast( + ProtoArgHelper::GetOptionalArg( + *op_def, "buffer_type", OpenCLBufferType::IN_OUT_CHANNEL)); + } std::vector image_shape; if (shape.size() == 1) { shape = {shape[0], 1, 1, 1}; @@ -75,9 +86,7 @@ MemoryBlock MemoryOptimizer::CreateMemoryBlock( } else { MACE_CHECK(shape.size() == 4) << "GPU only support 1D/2D/4D input"; } - OpenCLUtil::CalImage2DShape(shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); + OpenCLUtil::CalImage2DShape(shape, buffer_type, &image_shape); block.set_x(image_shape[0]); block.set_y(image_shape[1]); return block; @@ -95,7 +104,7 @@ MemoryBlock MemoryOptimizer::CreateMemoryBlock( void MemoryOptimizer::Optimize( const mace::OperatorDef *op_def, - const std::unordered_map &mem_types) { + const std::unordered_map *mem_types) { MACE_LATENCY_LOGGER(2, "Optimize memory"); if (op_def->output_size() != op_def->output_shape_size()) { VLOG(1) << op_def->name() @@ -117,6 +126,8 @@ void MemoryOptimizer::Optimize( op_def->output_type_size()); DataType dt; + bool has_data_format = ProtoArgHelper::GetOptionalArg( + *op_def, "has_data_format", 0) != 0; int output_size = op_def->output_size(); for (int i = 0; i < output_size; ++i) { if (i < op_def->output_type_size()) { @@ -127,22 +138,15 @@ void MemoryOptimizer::Optimize( int best_mem_id = -1; MemoryType mem_type = MemoryType::CPU_BUFFER; if (device == DeviceType::GPU) { - mem_type = mem_types.at(op_def->output(i)); + mem_type = mem_types->at(op_def->output(i)); } - auto shape = std::vector( - op_def->output_shape(i).dims().begin(), - op_def->output_shape(i).dims().end()); - MemoryBlock op_mem_block = CreateMemoryBlock(shape, dt, mem_type); + MemoryBlock op_mem_block = CreateMemoryBlock(op_def, i, dt, mem_type); MemoryBlock best_mem_block; if (IsMemoryReuseOp(op_def->type())) { if 
(tensor_mem_map_.count(op_def->input(0)) == 1) { - best_mem_id = tensor_mem_map_[op_def->input(0)].first; + best_mem_id = tensor_mem_map_.at(op_def->input(0)).mem_id; } } else { - auto shape = std::vector( - op_def->output_shape(i).dims().begin(), - op_def->output_shape(i).dims().end()); - int64_t op_mem_size = op_mem_block.x() * op_mem_block.y(); int64_t best_added_mem_size = LLONG_MAX; int64_t best_wasted_mem_size = LLONG_MAX; @@ -206,7 +210,8 @@ void MemoryOptimizer::Optimize( } else { mem_ref_count_[best_mem_id] = 1; } - tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt); + tensor_mem_map_.emplace(op_def->output(i), TensorMemInfo(best_mem_id, + dt, has_data_format)); } } @@ -218,7 +223,7 @@ void MemoryOptimizer::Optimize( tensor_ref_count_[input_name] -= 1; if (tensor_ref_count_.at(input_name) == 0 && tensor_mem_map_.count(input_name) == 1) { - int mem_id = tensor_mem_map_.at(input_name).first; + int mem_id = tensor_mem_map_.at(input_name).mem_id; mem_ref_count_[mem_id] -= 1; if (mem_ref_count_.at(mem_id) == 0) { idle_blocks_.insert(mem_id); @@ -238,7 +243,7 @@ const std::vector& MemoryOptimizer::mem_blocks() const { return mem_blocks_; } -const std::unordered_map>& +const std::unordered_map& MemoryOptimizer::tensor_mem_map() const { return tensor_mem_map_; } diff --git a/mace/core/memory_optimizer.h b/mace/core/memory_optimizer.h index 555613e6a2043a47289bab0d8a44c282097bafc8..986c5450280184990b426b18d99b886ee6f8fcac 100644 --- a/mace/core/memory_optimizer.h +++ b/mace/core/memory_optimizer.h @@ -77,31 +77,44 @@ class MemoryBlock { }; class MemoryOptimizer { + public: + struct TensorMemInfo { + int mem_id; + DataType data_type; + bool has_data_format; + + TensorMemInfo(int mem_id, DataType data_type, bool has_data_format) : + mem_id(mem_id), data_type(data_type), has_data_format(has_data_format) + {} + }; + public: static bool IsMemoryReuseOp(const std::string &op_type); void UpdateTensorRef(const std::string &tensor_name); void UpdateTensorRef(const OperatorDef *op_def); - void Optimize(const OperatorDef *op_def, - const std::unordered_map &mem_types); + void Optimize( + const OperatorDef *op_def, + const std::unordered_map *mem_types = nullptr); const std::vector &mem_blocks() const; - const std::unordered_map> &tensor_mem_map() const; + const std::unordered_map &tensor_mem_map() const; std::string DebugInfo() const; private: - MemoryBlock CreateMemoryBlock(std::vector shape, - DataType dt, - MemoryType mem_type); + MemoryBlock CreateMemoryBlock( + const OperatorDef *op_def, + int output_idx, + DataType dt, + MemoryType mem_type); private: std::unordered_map tensor_ref_count_; std::vector mem_blocks_; // tensor name : // Buffer Memory do not different data type, so store the data type. 
- std::unordered_map> tensor_mem_map_; + std::unordered_map tensor_mem_map_; std::unordered_map mem_ref_count_; std::set idle_blocks_; }; diff --git a/mace/core/net.cc b/mace/core/net.cc index 5ff777b0607715ac5caa9a3beb40c17841b00d3a..fbe1c1b8b9da81929732a77c176195f29dd688b9 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -19,14 +19,17 @@ #include #include "mace/core/future.h" -#include "mace/core/macros.h" #include "mace/core/memory_optimizer.h" #include "mace/core/net.h" #include "mace/core/op_context.h" #include "mace/public/mace.h" -#include "mace/utils/memory_logging.h" +#include "mace/port/env.h" +#include "mace/utils/conf_util.h" +#include "mace/utils/logging.h" +#include "mace/utils/macros.h" +#include "mace/utils/math.h" +#include "mace/utils/memory.h" #include "mace/utils/timer.h" -#include "mace/utils/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/opencl_util.h" @@ -38,12 +41,15 @@ namespace { struct InternalOutputInfo { InternalOutputInfo(const MemoryType mem_type, const DataType dtype, + const DataFormat data_format, const std::vector &shape, int op_idx) - : mem_type(mem_type), dtype(dtype), shape(shape), op_idx(op_idx) {} + : mem_type(mem_type), dtype(dtype), data_format(data_format), + shape(shape), op_idx(op_idx) {} MemoryType mem_type; // transformed memory type DataType dtype; + DataFormat data_format; std::vector shape; // tensor shape int op_idx; // operation which generate the tensor }; @@ -70,12 +76,12 @@ std::unique_ptr SerialNet::CreateOperation( const OpRegistryBase *op_registry, OpConstructContext *construct_context, std::shared_ptr op_def, - DataFormat data_format_flag, + bool has_data_format, bool is_quantize_model) { // Create the Operation DeviceType target_device_type = target_device_->device_type(); DeviceType device_type = DeviceType::CPU; - construct_context->set_device(cpu_device_); + construct_context->set_device(cpu_device_.get()); construct_context->set_operator_def(op_def); construct_context->set_output_mem_type(MemoryType::CPU_BUFFER); // Get available devices @@ -100,8 +106,7 @@ std::unique_ptr SerialNet::CreateOperation( if (!is_quantize_model && device_type == DeviceType::CPU && op_def->output_shape_size() == op_def->output_size()) { for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { - if (data_format_flag == NHWC && - op_def->output_shape(out_idx).dims_size() == 4) { + if (has_data_format && op_def->output_shape(out_idx).dims_size() == 4) { // NHWC -> NCHW std::vector output_shape = TransposeShape( @@ -115,9 +120,8 @@ std::unique_ptr SerialNet::CreateOperation( } } } - std::unique_ptr op( - op_registry->CreateOperation(construct_context, device_type)); - return std::move(op); + + return op_registry->CreateOperation(construct_context, device_type); } SerialNet::SerialNet(const OpRegistryBase *op_registry, @@ -129,17 +133,11 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, ws_(ws), target_device_(target_device), cpu_device_( - new CPUDevice(target_device->cpu_runtime()->num_threads(), - target_device->cpu_runtime()->policy(), - target_device->cpu_runtime()->use_gemmlowp())) { + make_unique( + target_device->cpu_runtime()->num_threads(), + target_device->cpu_runtime()->policy(), + target_device->cpu_runtime()->use_gemmlowp())) { MACE_LATENCY_LOGGER(1, "Constructing SerialNet"); - // output tensor : related information - std::unordered_map output_map; - // used for memory optimization - std::unordered_map output_mem_map; - std::unordered_set transformed_set; - // add input information - 
MemoryType target_mem_type; // quantize model flag bool is_quantize_model = IsQuantizedModel(*net_def); // Tensor Shape map @@ -149,20 +147,18 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, continue; } for (int i = 0; i < op.output_size(); ++i) { - tensor_shape_map[op.output(i)] = - std::move(std::vector(op.output_shape(i).dims().begin(), - op.output_shape(i).dims().end())); + tensor_shape_map[op.output(i)] = std::vector( + op.output_shape(i).dims().begin(), + op.output_shape(i).dims().end()); } } for (auto &tensor : net_def->tensors()) { tensor_shape_map[tensor.name()] = - std::move(std::vector(tensor.dims().begin(), - tensor.dims().end())); + std::vector(tensor.dims().begin(), tensor.dims().end()); } - DataFormat data_format_flag = NHWC; + bool has_data_format = false; if (target_device_->device_type() == DeviceType::CPU) { - target_mem_type = MemoryType::CPU_BUFFER; for (auto &input_info : net_def->input_info()) { std::vector input_shape = std::vector(input_info.dims().begin(), @@ -170,38 +166,45 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, // update tensor shape map tensor_shape_map[input_info.name()] = input_shape; // Only could be NONE or NHWC - auto input_data_format = static_cast( + DataFormat input_data_format = static_cast( input_info.data_format()); - if (!is_quantize_model && input_data_format == NHWC && + has_data_format = has_data_format || + (input_data_format != DataFormat::DF_NONE); + if (!is_quantize_model && input_data_format == DataFormat::NHWC && input_info.dims_size() == 4) { // NHWC -> NCHW input_shape = TransposeShape(input_shape, {0, 3, 1, 2}); - } else if (input_data_format == DataFormat::DF_NONE) { - data_format_flag = DataFormat::DF_NONE; } - output_map.emplace(input_info.name(), InternalOutputInfo( - target_mem_type, DataType::DT_FLOAT, input_shape, -1)); } } - #ifdef MACE_ENABLE_OPENCL - else { // GPU NOLINT[readability/braces] + // output tensor : related information + std::unordered_map output_map; + // used for memory optimization + std::unordered_map output_mem_map; + std::unordered_set transformed_set; + // add input information + MemoryType target_mem_type; + // default data format of output tensor + DataFormat default_output_df = DataFormat::DF_NONE; + if (target_device_->device_type() == DeviceType::GPU) { target_mem_type = MemoryType::GPU_BUFFER; for (auto &input_info : net_def->input_info()) { - auto input_data_format = static_cast( + DataFormat input_data_format = static_cast( input_info.data_format()); - if (input_data_format == DataFormat::DF_NONE) { - data_format_flag = DataFormat::DF_NONE; - } + has_data_format = input_data_format != DataFormat::DF_NONE; std::vector input_shape = std::vector(input_info.dims().begin(), input_info.dims().end()); // update tensor shape map tensor_shape_map[input_info.name()] = input_shape; output_map.emplace(input_info.name(), InternalOutputInfo( - target_mem_type, DataType::DT_FLOAT, input_shape, -1)); + target_mem_type, DataType::DT_FLOAT, input_data_format, + input_shape, -1)); } + default_output_df = + has_data_format ? 
DataFormat::NHWC : DataFormat::DF_NONE; } #endif // MACE_ENABLE_OPENCL @@ -212,7 +215,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, auto op = CreateOperation(op_registry, &construct_context, op_def, - data_format_flag, + has_data_format, is_quantize_model); #ifdef MACE_ENABLE_OPENCL // Add input transform operation if necessary @@ -246,11 +249,13 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, << output_info.mem_type << " to " << wanted_in_mem_type << ", from Data Type " << output_info.dtype << " to " - << wanted_in_dt; + << wanted_in_dt << ". with data format " + << output_info.data_format; std::string input_name = op_def->input(i); op_def->set_input(i, t_input_name); auto input_shape = output_info.shape; if (output_info.mem_type == MemoryType::CPU_BUFFER && + output_info.data_format == DataFormat::NCHW && input_shape.size() == 4) { // NCHW -> NHWC input_shape = @@ -258,14 +263,15 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, {0, 2, 3, 1}); } auto transform_op_def = OpenCLUtil::CreateTransformOpDef( - input_name, input_shape, t_input_name, - wanted_in_dt, wanted_in_mem_type, data_format_flag); + input_name, input_shape, t_input_name, wanted_in_dt, + construct_context.GetInputOpenCLBufferType(i), + wanted_in_mem_type, has_data_format); OpConstructContext t_construct_context(ws_); auto transform_op = CreateOperation( op_registry, &t_construct_context, transform_op_def, - data_format_flag); + has_data_format); operators_.emplace_back(std::move(transform_op)); transformed_set.insert(t_input_name); output_mem_map[t_input_name] = wanted_in_mem_type; @@ -299,6 +305,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, InternalOutputInfo( out_mem_type, dt, + default_output_df, op_def->output_shape().empty() ? std::vector() : std::vector( @@ -340,20 +347,21 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, output_mem_map[output_info.name()] = target_mem_type; } } - auto output_data_format = + bool output_has_data_format = static_cast(output_info.data_format()); auto transform_op_def = OpenCLUtil::CreateTransformOpDef( t_output_name, internal_output_info.shape, output_info.name(), output_info.data_type(), + OpenCLBufferType::IN_OUT_CHANNEL, target_mem_type, - data_format_flag); + output_has_data_format); auto transform_op = CreateOperation( op_registry, &construct_context, transform_op_def, - output_data_format); + output_has_data_format); operators_.emplace_back(std::move(transform_op)); // where to do graph reference count. 
mem_optimizer->UpdateTensorRef(transform_op_def.get()); @@ -370,7 +378,11 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, for (auto &op : operators_) { VLOG(2) << "Operator " << op->debug_def().name() << "<" << op->device_type() << ", " << op->debug_def().type() << ">"; - mem_optimizer->Optimize(op->operator_def().get(), output_mem_map); +#ifdef MACE_ENABLE_OPENCL + mem_optimizer->Optimize(op->operator_def().get(), &output_mem_map); +#else + mem_optimizer->Optimize(op->operator_def().get()); +#endif // MACE_ENABLE_OPENCL } VLOG(1) << mem_optimizer->DebugInfo(); } @@ -384,7 +396,7 @@ MaceStatus SerialNet::Init() { if (device_type == target_device_->device_type()) { init_context.set_device(target_device_); } else { - init_context.set_device(cpu_device_); + init_context.set_device(cpu_device_.get()); } // Initialize the operation MACE_RETURN_IF_ERROR(op->Init(&init_context)); @@ -395,7 +407,7 @@ MaceStatus SerialNet::Init() { MaceStatus SerialNet::Run(RunMetadata *run_metadata) { MACE_MEMORY_LOGGING_GUARD(); MACE_LATENCY_LOGGER(1, "Running net"); - OpContext context(ws_, cpu_device_); + OpContext context(ws_, cpu_device_.get()); for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { auto &op = *iter; DeviceType device_type = op->device_type(); @@ -408,7 +420,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { if (device_type == target_device_->device_type()) { context.set_device(target_device_); } else { - context.set_device(cpu_device_); + context.set_device(cpu_device_.get()); } CallStats call_stats; @@ -452,7 +464,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { bool transpose_a = op->GetOptionalArg("transpose_a", false); kernels = op->Input(0)->shape(); if (transpose_a) { - std::swap(kernels[kernels.size()-2], kernels[kernels.size()-1]); + std::swap(kernels[kernels.size() - 2], kernels[kernels.size() - 1]); } } else if (type.compare("FullyConnected") == 0) { kernels = op->Input(1)->shape(); @@ -472,7 +484,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { VLOG(3) << "Operator " << op->debug_def().name() << " has shape: " << MakeString(op->Output(0)->shape()); - if (EnvEnabled("MACE_LOG_TENSOR_RANGE")) { + if (EnvConfEnabled("MACE_LOG_TENSOR_RANGE")) { for (int i = 0; i < op->OutputSize(); ++i) { if (op->debug_def().quantize_info_size() == 0) { int data_type = op->GetOptionalArg("T", static_cast(DT_FLOAT)); @@ -498,16 +510,16 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { Tensor::MappingGuard guard(op->Output(i)); auto *output_data = op->Output(i)->data(); for (index_t j = 0; j < op->Output(i)->size(); ++j) { - int index = static_cast((output_data[j] - min_v) / bin_v); - if (index < 0) - index = 0; - else if (index > bin_size-1) - index = bin_size-1; - bin_distribution[index]++; + int index = static_cast((output_data[j] - min_v) / bin_v); + if (index < 0) + index = 0; + else if (index > bin_size - 1) + index = bin_size - 1; + bin_distribution[index]++; } LOG(INFO) << "Tensor range @@" << op->debug_def().output(i) - << "@@" << min_v << "," << max_v<< "@@" - << MakeString(bin_distribution); + << "@@" << min_v << "," << max_v << "@@" + << MakeString(bin_distribution); } } } diff --git a/mace/core/net.h b/mace/core/net.h index 9945d04637d5eafa402297462b3e9adf1375abdd..788eb611a54158791f988d446153b4b50ef8a59e 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -59,14 +59,14 @@ class SerialNet : public NetBase { const OpRegistryBase *op_registry, OpConstructContext *construct_context, std::shared_ptr op_def, - DataFormat 
input_format, + bool has_data_format, bool is_quantize_model = false); protected: Workspace *ws_; Device *target_device_; // CPU is base device. - Device *cpu_device_; + std::unique_ptr cpu_device_; std::vector > operators_; MACE_DISABLE_COPY_AND_ASSIGN(SerialNet); diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 319b0548d6b75794c3061862ee62599af38cdd7f..8fae1bd8a710f0fb9f6536960ae195ab6b94cba1 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -86,6 +86,27 @@ DataType OpConstructContext::GetInputDataType(size_t idx) const { return input_data_types_[idx]; } +#ifdef MACE_ENABLE_OPENCL +void OpConstructContext::SetInputOpenCLBufferType( + size_t idx, OpenCLBufferType buffer_type) { + if (input_opencl_buffer_types_.empty()) { + // the default inputs' memory types are same as output memory type. + input_opencl_buffer_types_.resize(operator_def_->input_size(), + OpenCLBufferType::IN_OUT_CHANNEL); + } + MACE_CHECK(idx < input_opencl_buffer_types_.size()); + input_opencl_buffer_types_[idx] = buffer_type; +} +OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType( + size_t idx) const { + if (input_opencl_buffer_types_.empty()) { + return OpenCLBufferType::IN_OUT_CHANNEL; + } + MACE_CHECK(idx < input_opencl_buffer_types_.size()); + return input_opencl_buffer_types_[idx]; +} +#endif // MACE_ENABLE_OPENCL + OpInitContext::OpInitContext(Workspace *ws, Device *device) : ws_(ws), device_(device) {} diff --git a/mace/core/operator.h b/mace/core/operator.h index 03a0f0749954b052b9b2dae558c0fed36612f5e5..e59af9ab166a5ace99bc7cc59b17a025cc0b1645 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -26,6 +26,9 @@ #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/proto/mace.pb.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_util.h" +#endif // MACE_ENABLE_OPENCL namespace mace { @@ -72,6 +75,11 @@ class OpConstructContext { DataType GetInputDataType(size_t idx) const; +#ifdef MACE_ENABLE_OPENCL + void SetInputOpenCLBufferType(size_t idx, OpenCLBufferType buffer_type); + OpenCLBufferType GetInputOpenCLBufferType(size_t idx) const; +#endif // MACE_ENABLE_OPENCL + private: std::shared_ptr operator_def_; Workspace *ws_; @@ -81,6 +89,9 @@ class OpConstructContext { std::vector input_mem_types_; std::vector input_data_types_; MemoryType output_mem_type_; // there is only one output memory type now. 
+#ifdef MACE_ENABLE_OPENCL + std::vector input_opencl_buffer_types_; +#endif // MACE_ENABLE_OPENCL }; // memory_optimizer, device diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index c4ed389265a9881fd6505476ffe45f5852f1bc15..5db5b36b1bb8bd2d2399f1cfa4ba406e78654a40 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -18,9 +18,6 @@ #include #endif -#include -#include -#include #include #include #include @@ -29,8 +26,9 @@ #include #include -#include "mace/core/macros.h" +#include "mace/port/env.h" #include "mace/public/mace.h" +#include "mace/utils/macros.h" #include "mace/utils/logging.h" namespace mace { @@ -42,101 +40,36 @@ struct CPUFreq { float freq; }; -namespace { - -int GetCPUCount() { - int cpu_count = 0; - std::string cpu_sys_conf = "/proc/cpuinfo"; - std::ifstream f(cpu_sys_conf); - if (!f.is_open()) { - LOG(ERROR) << "failed to open " << cpu_sys_conf; - return -1; - } - std::string line; - const std::string processor_key = "processor"; - while (std::getline(f, line)) { - if (line.size() >= processor_key.size() - && line.compare(0, processor_key.size(), processor_key) == 0) { - ++cpu_count; - } - } - if (f.bad()) { - LOG(ERROR) << "failed to read " << cpu_sys_conf; - } - if (!f.eof()) { - LOG(ERROR) << "failed to read end of " << cpu_sys_conf; - } - f.close(); - VLOG(2) << "CPU cores: " << cpu_count; - return cpu_count; -} - -int GetCPUMaxFreq(std::vector *max_freqs) { - int cpu_count = GetCPUCount(); - for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) { - std::string cpuinfo_max_freq_sys_conf = MakeString( - "/sys/devices/system/cpu/cpu", - cpu_id, - "/cpufreq/cpuinfo_max_freq"); - std::ifstream f(cpuinfo_max_freq_sys_conf); - if (!f.is_open()) { - LOG(ERROR) << "failed to open " << cpuinfo_max_freq_sys_conf; - return -1; - } - std::string line; - if (std::getline(f, line)) { - float freq = strtof(line.c_str(), nullptr); - max_freqs->push_back(freq); - } - if (f.bad()) { - LOG(ERROR) << "failed to read " << cpuinfo_max_freq_sys_conf; - } - f.close(); - } - - for (float freq : *max_freqs) { - VLOG(2) << "CPU freq: " << freq; - } - - return 0; -} +enum SchedulePolicy { + SCHED_STATIC, + SCHED_GUIDED, +}; -MaceStatus SetThreadAffinity(cpu_set_t mask) { -#if defined(__ANDROID__) - pid_t pid = gettid(); -#else - pid_t pid = syscall(SYS_gettid); -#endif - int err = sched_setaffinity(pid, sizeof(mask), &mask); - if (err) { - LOG(WARNING) << "set affinity error: " << strerror(errno); - return MaceStatus(MaceStatus::MACE_INVALID_ARGS, - "set affinity error: " + std::string(strerror(errno))); - } else { - return MaceStatus::MACE_SUCCESS; - } -} +namespace { MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, - const std::vector &cpu_ids) { + const std::vector &cpu_ids, + SchedulePolicy schedule_policy) { MaceOpenMPThreadCount = omp_num_threads; #ifdef MACE_ENABLE_OPENMP VLOG(1) << "Set OpenMP threads number: " << omp_num_threads << ", CPU core IDs: " << MakeString(cpu_ids); - omp_set_schedule(omp_sched_guided, 1); + if (schedule_policy == SCHED_GUIDED) { + omp_set_schedule(omp_sched_guided, 1); + } else if (schedule_policy == SCHED_STATIC) { + omp_set_schedule(omp_sched_static, 0); + } else { + LOG(WARNING) << "Unknown schedule policy: " << schedule_policy; + } + omp_set_num_threads(omp_num_threads); #else MACE_UNUSED(omp_num_threads); + MACE_UNUSED(schedule_policy); LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled."; #endif - // compute mask - cpu_set_t mask; - 
CPU_ZERO(&mask); - for (auto cpu_id : cpu_ids) { - CPU_SET(cpu_id, &mask); - } #ifdef MACE_ENABLE_OPENMP std::vector status(omp_num_threads, MaceStatus::MACE_INVALID_ARGS); @@ -144,7 +77,7 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, for (int i = 0; i < omp_num_threads; ++i) { VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num() << "/" << omp_get_num_threads(); - status[i] = SetThreadAffinity(mask); + status[i] = SchedSetAffinity(cpu_ids); } for (int i = 0; i < omp_num_threads; ++i) { if (status[i] != MaceStatus::MACE_SUCCESS) @@ -152,8 +85,8 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, } return MaceStatus::MACE_SUCCESS; #else - MaceStatus status = SetThreadAffinity(mask); - VLOG(1) << "Set affinity without OpenMP: " << mask.__bits[0]; + MaceStatus status = SchedSetAffinity(cpu_ids); + VLOG(1) << "Set affinity without OpenMP: " << MakeString(cpu_ids); return status; #endif } @@ -166,8 +99,9 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( void *gemm_context) { // get cpu frequency info std::vector cpu_max_freqs; - if (GetCPUMaxFreq(&cpu_max_freqs) == -1 || cpu_max_freqs.size() == 0) { - return MaceStatus::MACE_INVALID_ARGS; + MACE_RETURN_IF_ERROR(GetCPUMaxFreq(&cpu_max_freqs)); + if (cpu_max_freqs.empty()) { + return MaceStatus::MACE_RUNTIME_ERROR; } std::vector cpu_freq(cpu_max_freqs.size()); @@ -228,6 +162,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( } else { cores_to_use = num_threads_hint; } + MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0"); VLOG(2) << "Use " << num_threads_hint << " threads"; std::vector cpu_ids(cores_to_use); @@ -236,6 +171,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( << cpu_freq[i].freq; cpu_ids[i] = cpu_freq[i].core_id; } + SchedulePolicy sched_policy = SCHED_GUIDED; + if (std::abs(cpu_freq[0].freq - cpu_freq[cores_to_use - 1].freq) < 1e-6) { + sched_policy = SCHED_STATIC; + } #ifdef MACE_ENABLE_QUANTIZE if (gemm_context) { @@ -244,7 +183,9 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( } #endif // MACE_ENABLE_QUANTIZE - return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids); + return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, + cpu_ids, + sched_policy); } } // namespace mace diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h index 95fee27f5424eeed2f29eb782bd085115ab430c9..ab067ebaae698e2296dcee5469c93961f654b628 100644 --- a/mace/core/runtime/cpu/cpu_runtime.h +++ b/mace/core/runtime/cpu/cpu_runtime.h @@ -22,7 +22,7 @@ #include "public/gemmlowp.h" #endif // MACE_ENABLE_QUANTIZE -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/public/mace.h" #include "mace/utils/logging.h" @@ -52,13 +52,13 @@ class CPURuntime { #ifdef MACE_ENABLE_QUANTIZE ~CPURuntime() { - if (!gemm_context_) { + if (gemm_context_ != nullptr) { delete static_cast(gemm_context_); } } gemmlowp::GemmContext *GetGemmlowpContext() { - if (!gemm_context_) { + if (gemm_context_ == nullptr) { gemm_context_ = new gemmlowp::GemmContext(); } return static_cast(gemm_context_); diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.h b/mace/core/runtime/hexagon/hexagon_control_wrapper.h index 1674e6cfdeefd3cfb1df9f5c71383715a6c3b1ba..eda740f400e47bab5fac2ab04057522ad9f9b7ce 100644 --- a/mace/core/runtime/hexagon/hexagon_control_wrapper.h +++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.h @@ -15,49 +15,68 @@ #ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_ #define 
MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_ +#include +#include #include -#include "mace/core/runtime/hexagon/quantize.h" #include "mace/core/tensor.h" #include "mace/public/mace.h" -#include "third_party/nnlib/hexagon_nn.h" namespace mace { +struct InOutInfo { + InOutInfo(const std::vector &shape, + const DataType data_type, + const float scale, + const int32_t zero_point, + std::unique_ptr tensor_u8) + : shape(shape), + data_type(data_type), + scale(scale), + zero_point(zero_point), + tensor_u8(std::move(tensor_u8)) {} + + std::vector shape; + DataType data_type; + float scale; + int32_t zero_point; + std::unique_ptr tensor_u8; +}; + class HexagonControlWrapper { public: - HexagonControlWrapper() {} - int GetVersion(); - bool Config(); - bool Init(); - bool Finalize(); - bool SetupGraph(const NetDef &net_def, const unsigned char *model_data); - bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor); - bool ExecuteGraphNew(const std::vector &input_tensors, - std::vector *output_tensors); + HexagonControlWrapper() = default; + virtual ~HexagonControlWrapper() = default; - bool TeardownGraph(); - void PrintLog(); - void PrintGraph(); - void GetPerfInfo(); - void ResetPerfInfo(); - void SetDebugLevel(int level); + virtual int GetVersion() = 0; + virtual bool Config() = 0; + virtual bool Init() = 0; + virtual bool Finalize() = 0; + virtual bool SetupGraph(const NetDef &net_def, + const unsigned char *model_data) = 0; + virtual bool ExecuteGraph(const Tensor &input_tensor, + Tensor *output_tensor) = 0; + virtual bool ExecuteGraphNew(const std::vector &input_tensors, + std::vector *output_tensors) = 0; + virtual bool TeardownGraph() = 0; + virtual void PrintLog() = 0; + virtual void PrintGraph() = 0; + virtual void GetPerfInfo() = 0; + virtual void ResetPerfInfo() = 0; + virtual void SetDebugLevel(int level) = 0; - private: - static constexpr int NODE_ID_OFFSET = 10000; - static constexpr int NUM_METADATA = 4; + protected: + static constexpr int kNodeIdOffset = 10000; + static constexpr int kNumMetaData = 4; - inline uint32_t node_id(uint32_t nodeid) { return NODE_ID_OFFSET + nodeid; } + inline uint32_t node_id(uint32_t nodeid) { return kNodeIdOffset + nodeid; } int nn_id_; - Quantizer quantizer_; - std::vector> input_shapes_; - std::vector> output_shapes_; - std::vector input_data_types_; - std::vector output_data_types_; - uint32_t num_inputs_; - uint32_t num_outputs_; + std::vector input_info_; + std::vector output_info_; + int num_inputs_; + int num_outputs_; MACE_DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper); }; diff --git a/mace/core/runtime/hexagon/hexagon_device.h b/mace/core/runtime/hexagon/hexagon_device.h index 0c933ae0b6ff2171008058cc074c293e1909b819..f80607d3196582f850d0911fec0429784cabaca0 100644 --- a/mace/core/runtime/hexagon/hexagon_device.h +++ b/mace/core/runtime/hexagon/hexagon_device.h @@ -15,18 +15,55 @@ #ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_ #define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_ +#include +#include + #include "mace/core/device.h" +#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" +#ifdef MACE_ENABLE_HEXAGON +#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h" +#endif +#ifdef MACE_ENABLE_HTA +#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h" +#endif namespace mace { class HexagonDevice : public CPUDevice { public: - HexagonDevice() : CPUDevice(0, AFFINITY_NONE, false) {} + explicit HexagonDevice(DeviceType device_type) + : CPUDevice(0, AFFINITY_NONE, false), + device_type_(device_type) {} 
DeviceType device_type() const override { - return DeviceType::HEXAGON; + return device_type_; }; + + private: + DeviceType device_type_; }; +std::unique_ptr CreateHexagonControlWrapper( + DeviceType device_type) { + std::unique_ptr hexagon_controller; + + switch (device_type) { +#ifdef MACE_ENABLE_HEXAGON + case HEXAGON: + hexagon_controller = make_unique(); + break; +#endif +#ifdef MACE_ENABLE_HTA + case HTA: + hexagon_controller = make_unique(); + break; +#endif + default: + LOG(FATAL) << "Not supported Hexagon device type: " << device_type; + } + + return hexagon_controller; +} + } // namespace mace #endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_ diff --git a/mace/core/runtime/hexagon/hexagon_nn_ops.h b/mace/core/runtime/hexagon/hexagon_dsp_ops.h similarity index 89% rename from mace/core/runtime/hexagon/hexagon_nn_ops.h rename to mace/core/runtime/hexagon/hexagon_dsp_ops.h index 3ebedb8eb8d81850cd29383fd7667c42b2369262..1f50e13cb48bb8133fc31d71752a623fed16217f 100644 --- a/mace/core/runtime/hexagon/hexagon_nn_ops.h +++ b/mace/core/runtime/hexagon/hexagon_dsp_ops.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_ -#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_ +#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_ +#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_ #include #include @@ -57,4 +57,4 @@ class OpMap { }; } // namespace mace -#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_ +#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_ diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc similarity index 84% rename from mace/core/runtime/hexagon/hexagon_control_wrapper.cc rename to mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc index 5e0cb77213316f29b7f7f08a54d6380696d131a5..a98d9ad1499251a15d7b969cecee2eaf28f84347 100644 --- a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc +++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc @@ -12,26 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include #include +#include #include // NOLINT(build/c++11) #include #include #include #include -#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" -#include "mace/core/runtime/hexagon/hexagon_nn_ops.h" +#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h" +#include "mace/core/runtime/hexagon/hexagon_dsp_ops.h" #include "mace/core/types.h" - -namespace { -inline int64_t NowMicros() { - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; -} -} +#include "mace/port/env.h" +#include "mace/utils/memory.h" +#include "third_party/nnlib/hexagon_nn.h" namespace mace { @@ -92,33 +87,33 @@ std::string FloatToString(const FloatType v, const int32_t precision) { } } // namespace -int HexagonControlWrapper::GetVersion() { +int HexagonDSPWrapper::GetVersion() { int version; MACE_CHECK(hexagon_nn_version(&version) == 0, "get version error"); return version; } -bool HexagonControlWrapper::Config() { +bool HexagonDSPWrapper::Config() { LOG(INFO) << "Hexagon config"; MACE_CHECK(hexagon_nn_set_powersave_level(0) == 0, "hexagon power error"); MACE_CHECK(hexagon_nn_config() == 0, "hexagon config error"); return true; } -bool HexagonControlWrapper::Init() { +bool HexagonDSPWrapper::Init() { LOG(INFO) << "Hexagon init"; MACE_CHECK(hexagon_nn_init(&nn_id_) == 0, "hexagon_nn_init failed"); ResetPerfInfo(); return true; } -bool HexagonControlWrapper::Finalize() { +bool HexagonDSPWrapper::Finalize() { LOG(INFO) << "Hexagon finalize"; return hexagon_nn_set_powersave_level(1) == 0; } -bool HexagonControlWrapper::SetupGraph(const NetDef &net_def, - unsigned const char *model_data) { +bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def, + unsigned const char *model_data) { LOG(INFO) << "Hexagon setup graph"; int64_t t0 = NowMicros(); @@ -236,29 +231,35 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def, cached_outputs.clear(); // input info - num_inputs_ = 0; - for (const InputInfo &input_info : net_def.input_info()) { + num_inputs_ = net_def.input_info_size(); + input_info_.reserve(num_inputs_); + for (const InputOutputInfo &input_info : net_def.input_info()) { std::vector input_shape(input_info.dims().begin(), input_info.dims().end()); while (input_shape.size() < 4) { input_shape.insert(input_shape.begin(), 1); } - input_shapes_.push_back(input_shape); - input_data_types_.push_back(input_info.data_type()); - num_inputs_ += 1; + input_info_.emplace_back(input_shape, + input_info.data_type(), + input_info.scale(), + input_info.zero_point(), + make_unique()); } // output info - num_outputs_ = 0; - for (const OutputInfo &output_info : net_def.output_info()) { + num_outputs_ = net_def.output_info_size(); + output_info_.reserve(num_outputs_); + for (const InputOutputInfo &output_info : net_def.output_info()) { std::vector output_shape(output_info.dims().begin(), output_info.dims().end()); while (output_shape.size() < 4) { output_shape.insert(output_shape.begin(), 1); } - output_shapes_.push_back(output_shape); - output_data_types_.push_back(output_info.data_type()); - num_outputs_ += 1; + output_info_.emplace_back(output_shape, + output_info.data_type(), + output_info.scale(), + output_info.zero_point(), + make_unique()); VLOG(1) << "OutputInfo: " << "\n\t shape: " << output_shape[0] << " " << output_shape[1] << " " << output_shape[2] << " " << output_shape[3] @@ -276,14 +277,14 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def, return true; } -bool HexagonControlWrapper::TeardownGraph() { +bool 
HexagonDSPWrapper::TeardownGraph() { LOG(INFO) << "Hexagon teardown graph"; return hexagon_nn_teardown(nn_id_) == 0; } #define MACE_PRINT_BUFSIZE (2 * 1024 * 1024) -void HexagonControlWrapper::PrintLog() { +void HexagonDSPWrapper::PrintLog() { char *buf; if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return; MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast(buf), @@ -293,7 +294,7 @@ void HexagonControlWrapper::PrintLog() { delete[] buf; } -void HexagonControlWrapper::PrintGraph() { +void HexagonDSPWrapper::PrintGraph() { LOG(INFO) << "Print Graph"; char *buf; if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return; @@ -304,13 +305,13 @@ void HexagonControlWrapper::PrintGraph() { delete[] buf; } -void HexagonControlWrapper::SetDebugLevel(int level) { +void HexagonDSPWrapper::SetDebugLevel(int level) { LOG(INFO) << "Set debug level: " << level; MACE_CHECK(hexagon_nn_set_debug_level(nn_id_, level) == 0, "set debug level error"); } -void HexagonControlWrapper::GetPerfInfo() { +void HexagonDSPWrapper::GetPerfInfo() { LOG(INFO) << "Get perf info"; std::vector perf_info(MACE_MAX_NODE); unsigned int n_items = 0; @@ -385,20 +386,20 @@ void HexagonControlWrapper::GetPerfInfo() { LOG(INFO) << "total duration: " << std::fixed << total_duration; } -void HexagonControlWrapper::ResetPerfInfo() { +void HexagonDSPWrapper::ResetPerfInfo() { LOG(INFO) << "Reset perf info"; MACE_CHECK(hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME) == 0, "reset perf error"); } -bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor, - Tensor *output_tensor) { +bool HexagonDSPWrapper::ExecuteGraph(const Tensor &input_tensor, + Tensor *output_tensor) { VLOG(2) << "Execute graph: " << nn_id_; // single input and single output MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num"); MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num"); - output_tensor->SetDtype(output_data_types_[0]); - output_tensor->Resize(output_shapes_[0]); + output_tensor->SetDtype(output_info_[0].data_type); + output_tensor->Resize(output_info_[0].shape); std::vector output_shape(4); uint32_t output_bytes; int res = hexagon_nn_execute( @@ -418,10 +419,11 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor, &output_bytes); MACE_CHECK(res == 0, "execute error"); - MACE_ASSERT(output_shape.size() == output_shapes_[0].size(), + MACE_ASSERT(output_shape.size() == output_info_[0].shape.size(), "wrong output shape inferred"); for (size_t i = 0; i < output_shape.size(); ++i) { - MACE_ASSERT(static_cast(output_shape[i]) == output_shapes_[0][i], + MACE_ASSERT(static_cast(output_shape[i]) + == output_info_[0].shape[i], "wrong output shape inferred"); } MACE_ASSERT(output_bytes == output_tensor->raw_size(), @@ -429,7 +431,7 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor, return res == 0; } -bool HexagonControlWrapper::ExecuteGraphNew( +bool HexagonDSPWrapper::ExecuteGraphNew( const std::vector &input_tensors, std::vector *output_tensors) { VLOG(2) << "Execute graph new: " << nn_id_; @@ -438,14 +440,15 @@ bool HexagonControlWrapper::ExecuteGraphNew( MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num"); MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num"); - std::vector inputs(num_inputs * NUM_METADATA); - std::vector outputs(num_outputs * NUM_METADATA); + std::vector inputs(num_inputs * kNumMetaData); + std::vector outputs(num_outputs * kNumMetaData); std::vector input_metadata(num_inputs); std::vector output_metadata(num_outputs); + // transform mace input to hexagon input for (size_t i 
= 0; i < num_inputs; ++i) { std::vector input_shape = input_tensors[i]->shape(); - size_t index = i * NUM_METADATA; + size_t index = i * kNumMetaData; inputs[index].batches = static_cast(input_shape[0]); inputs[index].height = static_cast(input_shape[1]); inputs[index].width = static_cast(input_shape[2]); @@ -453,8 +456,8 @@ bool HexagonControlWrapper::ExecuteGraphNew( inputs[index].data = const_cast( reinterpret_cast(input_tensors[i]->raw_data())); inputs[index].dataLen = static_cast(input_tensors[i]->raw_size()); - inputs[index].data_valid_len = static_cast( - input_tensors[i]->raw_size()); + inputs[index].data_valid_len = + static_cast(input_tensors[i]->raw_size()); inputs[index].unused = 0; input_metadata[i].Init(.0f, .0f, 1); AddInputMetadata(input_metadata[i].min_val, &inputs[index + 1]); @@ -462,38 +465,44 @@ bool HexagonControlWrapper::ExecuteGraphNew( AddInputMetadata(input_metadata[i].needs_quantization, &inputs[index + 3]); } + // transform mace output to hexagon output for (size_t i = 0; i < num_outputs; ++i) { - size_t index = i * NUM_METADATA; - (*output_tensors)[i]->SetDtype(output_data_types_[i]); - (*output_tensors)[i]->Resize(output_shapes_[i]); + size_t index = i * kNumMetaData; + (*output_tensors)[i]->SetDtype(output_info_[i].data_type); + (*output_tensors)[i]->Resize(output_info_[i].shape); + outputs[index].data = reinterpret_cast( (*output_tensors)[i]->raw_mutable_data()); outputs[index].dataLen = static_cast((*output_tensors)[i]->raw_size()); output_metadata[i].Init(.0f, .0f, 1); + AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]); AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]); AddOutputMetadata(output_metadata[i].needs_quantization, &outputs[index + 3]); } + // Execute graph int res = hexagon_nn_execute_new(nn_id_, inputs.data(), - num_inputs * NUM_METADATA, + num_inputs * kNumMetaData, outputs.data(), - num_outputs * NUM_METADATA); + num_outputs * kNumMetaData); + // handle hexagon output for (size_t i = 0; i < num_outputs; ++i) { - size_t index = i * NUM_METADATA; + size_t index = i * kNumMetaData; std::vector output_shape{ outputs[index].batches, outputs[index].height, outputs[index].width, outputs[index].depth}; - MACE_ASSERT(output_shape.size() == output_shapes_[i].size(), + MACE_ASSERT(output_shape.size() == output_info_[i].shape.size(), "wrong output shape inferred"); for (size_t j = 0; j < output_shape.size(); ++j) { MACE_ASSERT(static_cast(output_shape[j]) - == output_shapes_[i][j], + == output_info_[i].shape[j], "wrong output shape inferred"); } + MACE_ASSERT(static_cast(outputs[index].data_valid_len) == (*output_tensors)[i]->raw_size(), "wrong output bytes inferred."); diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..2c46414bf390b87af35f2000e2732b0e50663e95 --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h @@ -0,0 +1,51 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_ +#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_ + +#include + +#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" +#include "mace/core/tensor.h" +#include "mace/public/mace.h" + +namespace mace { + +class HexagonDSPWrapper : public HexagonControlWrapper { + public: + HexagonDSPWrapper() = default; + + int GetVersion() override; + bool Config() override; + bool Init() override; + bool Finalize() override; + bool SetupGraph(const NetDef &net_def, + const unsigned char *model_data) override; + bool ExecuteGraph(const Tensor &input_tensor, + Tensor *output_tensor) override; + bool ExecuteGraphNew(const std::vector &input_tensors, + std::vector *output_tensors) override; + bool TeardownGraph() override; + void PrintLog() override; + void PrintGraph() override; + void GetPerfInfo() override; + void ResetPerfInfo() override; + void SetDebugLevel(int level) override; + + MACE_DISABLE_COPY_AND_ASSIGN(HexagonDSPWrapper); +}; +} // namespace mace + +#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_ diff --git a/mace/core/runtime/hexagon/hexagon_hta_ops.h b/mace/core/runtime/hexagon/hexagon_hta_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..39a108609d815b2eeaf805d611b5fb4fbd69c564 --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_hta_ops.h @@ -0,0 +1,50 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_ +#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_ + +#include +#include + +#include "mace/utils/logging.h" +#include "third_party/hta/hta_hexagon_nn_ops.h" + +namespace mace { + +class OpMap { + public: + void Init() { +#define HTA_DEF_OP(NAME) op_map_[#NAME] = HTA_OP_##NAME; + +#include "third_party/hta/hta_ops.h" + +#undef HTA_DEF_OP + } + + hta_op_type GetOpId(const std::string &op_type) { + if (op_map_.find(op_type) != end(op_map_)) { + return op_map_[op_type]; + } else { + LOG(ERROR) << "HTA unsupported op type: " << op_type; + return HTA_NN_OPS_MAX; + } + } + + private: + std::unordered_map op_map_; +}; +} // namespace mace + +#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_ diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..e3754f19ca8f0528e0679816cd18c0ccfbb1197a --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc @@ -0,0 +1,318 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "mace/core/runtime/hexagon/hexagon_hta_ops.h" +#include "mace/core/types.h" +#include "mace/utils/memory.h" +#include "mace/utils/quantize.h" +#include "third_party/hta/hta_hexagon_api.h" + +namespace mace { + +int HexagonHTAWrapper::GetVersion() { + int version; + MACE_CHECK(hexagon_hta_nn_version(&version) == 0, "get version error"); + return version; +} + +bool HexagonHTAWrapper::Config() { + LOG(INFO) << "HTA config"; + MACE_CHECK(hexagon_hta_nn_config() == 0, "hexagon config error"); + return true; +} + +bool HexagonHTAWrapper::Init() { + LOG(INFO) << "Hexagon init"; + MACE_CHECK(hexagon_hta_nn_init(&nn_id_) == 0, "hexagon_nn_init failed"); + ResetPerfInfo(); + return true; +} + +bool HexagonHTAWrapper::Finalize() { + LOG(INFO) << "Hexagon finalize"; + return true; +} + +bool HexagonHTAWrapper::SetupGraph(const NetDef &net_def, + unsigned const char *model_data) { + LOG(INFO) << "Hexagon setup graph"; + + int64_t t0 = NowMicros(); + + // const node + for (const ConstTensor &const_tensor : net_def.tensors()) { + std::vector tensor_shape(const_tensor.dims().begin(), + const_tensor.dims().end()); + while (tensor_shape.size() < 4) { + tensor_shape.insert(tensor_shape.begin(), 1); + } + + hexagon_nn_const_node const_node; + const_node.node_id = node_id(const_tensor.node_id()); + const_node.tensor.batches = tensor_shape[0]; + const_node.tensor.height = tensor_shape[1]; + const_node.tensor.width = tensor_shape[2]; + const_node.tensor.depth = tensor_shape[3]; + + if (const_tensor.data_type() == DataType::DT_INT32 && + const_tensor.data_size() == 0) { + const_node.tensor.data = NULL; + const_node.tensor.dataLen = 0; + } else { + const_node.tensor.data = + const_cast(model_data + const_tensor.offset()); + const_node.tensor.dataLen = const_tensor.data_size() * + GetEnumTypeSize(const_tensor.data_type()); + } + + hexagon_hta_nn_append_const_node(nn_id_, + const_node.node_id, + const_node.tensor.batches, + const_node.tensor.height, + const_node.tensor.width, + const_node.tensor.depth, + const_node.tensor.data, + const_node.tensor.dataLen); + } + + // op node + OpMap op_map; + op_map.Init(); + std::vector> cached_inputs; + std::vector> cached_outputs; + std::vector inputs; + std::vector outputs; + + for (const OperatorDef &op : net_def.op()) { + hta_op_type op_id = op_map.GetOpId(op.type()); + inputs.resize(op.node_input().size()); + for (int i = 0; i < op.node_input().size(); ++i) { + inputs[i].src_id = node_id(op.node_input()[i].node_id()); + inputs[i].output_idx = op.node_input()[i].output_port(); + } + outputs.resize(op.output_shape().size()); + for (int i = 0; i < op.output_shape().size(); ++i) { + outputs[i].rank = op.output_shape()[i].dims().size(); + for (size_t j = 0; j < outputs[i].rank; ++j) { + outputs[i].max_sizes[j] = op.output_shape()[i].dims()[j]; + } + if (outputs[i].rank == 0) { + outputs[i].rank = 1; + outputs[i].max_sizes[0] = 1; + } + outputs[i].max_sizes[outputs[i].rank] = 0; + outputs[i].elementsize = 
GetEnumTypeSize( + static_cast(op.output_type()[i])); + outputs[i].zero_offset = 0; + outputs[i].stepsize = 0; + } + cached_inputs.push_back(inputs); + cached_outputs.push_back(outputs); + + auto padding_type = static_cast(op.padding()); + + hexagon_nn_op_node op_node; + op_node.node_id = node_id(op.node_id()); + op_node.operation = op_id; + op_node.padding = padding_type; + op_node.inputs = cached_inputs.back().data(); + op_node.inputsLen = inputs.size(); + op_node.outputs = cached_outputs.back().data(); + op_node.outputsLen = outputs.size(); + + hexagon_hta_nn_append_node(nn_id_, + op_node.node_id, + op_node.operation, + op_node.padding, + op_node.inputs, + op_node.inputsLen, + op_node.outputs, + op_node.outputsLen); + } + + // input info + num_inputs_ = net_def.input_info_size(); + input_info_.reserve(num_inputs_); + for (const InputOutputInfo &input_info : net_def.input_info()) { + std::vector input_shape(input_info.dims().begin(), + input_info.dims().end()); + while (input_shape.size() < 4) { + input_shape.insert(input_shape.begin(), 1); + } + input_info_.emplace_back(input_shape, + input_info.data_type(), + input_info.scale(), + input_info.zero_point(), + make_unique()); + } + + // output info + num_outputs_ = net_def.output_info_size(); + output_info_.reserve(num_outputs_); + for (const InputOutputInfo &output_info : net_def.output_info()) { + std::vector output_shape(output_info.dims().begin(), + output_info.dims().end()); + while (output_shape.size() < 4) { + output_shape.insert(output_shape.begin(), 1); + } + output_info_.emplace_back(output_shape, + output_info.data_type(), + output_info.scale(), + output_info.zero_point(), + make_unique()); + VLOG(1) << "OutputInfo: " + << "\n\t shape: " << output_shape[0] << " " << output_shape[1] + << " " << output_shape[2] << " " << output_shape[3] + << "\n\t type: " << output_info.data_type(); + } + + int64_t t1 = NowMicros(); + + MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed"); + + int64_t t2 = NowMicros(); + + VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1; + + return true; +} + +bool HexagonHTAWrapper::TeardownGraph() { + LOG(INFO) << "Hexagon teardown graph"; + return hexagon_hta_nn_teardown(nn_id_) == 0; +} + +void HexagonHTAWrapper::PrintLog() { + LOG(INFO) << "Print Log"; +} + +void HexagonHTAWrapper::PrintGraph() { + LOG(INFO) << "Print Graph"; +} + +void HexagonHTAWrapper::SetDebugLevel(int level) { + LOG(INFO) << "Set debug level: " << level; + MACE_CHECK(hexagon_hta_nn_set_debug_level(nn_id_, level) == 0, + "set debug level error"); +} + +void HexagonHTAWrapper::GetPerfInfo() { + LOG(INFO) << "Get perf info"; +} + +void HexagonHTAWrapper::ResetPerfInfo() { + LOG(INFO) << "Reset perf info"; +} + +bool HexagonHTAWrapper::ExecuteGraph(const Tensor &input_tensor, + Tensor *output_tensor) { + MACE_UNUSED(input_tensor); + MACE_UNUSED(output_tensor); + MACE_NOT_IMPLEMENTED; + return false; +} + +bool HexagonHTAWrapper::ExecuteGraphNew( + const std::vector &input_tensors, + std::vector *output_tensors) { + VLOG(2) << "Execute graph new: " << nn_id_; + uint32_t num_inputs = static_cast(input_tensors.size()); + uint32_t num_outputs = static_cast(output_tensors->size()); + MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num"); + MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num"); + + std::vector inputs(num_inputs); + std::vector outputs(num_outputs); + + for (size_t i = 0; i < num_inputs; ++i) { + std::vector input_shape = input_tensors[i]->shape(); + inputs[i].batches = 
static_cast(input_shape[0]); + inputs[i].height = static_cast(input_shape[1]); + inputs[i].width = static_cast(input_shape[2]); + inputs[i].depth = static_cast(input_shape[3]); + input_info_[i].tensor_u8->SetDtype(DT_UINT8); + input_info_[i].tensor_u8->Resize(input_shape); + + const float *input_data = input_tensors[i]->data(); + uint8_t *input_data_u8 = input_info_[i].tensor_u8->mutable_data(); + QuantizeWithScaleAndZeropoint(input_data, + input_tensors[i]->size(), + input_info_[i].scale, + input_info_[i].zero_point, + input_data_u8); + + inputs[i].data = const_cast( + reinterpret_cast( + input_info_[i].tensor_u8->raw_data())); + inputs[i].dataLen = static_cast(input_info_[i].tensor_u8->raw_size()); + inputs[i].data_valid_len = static_cast( + input_info_[i].tensor_u8->raw_size()); + inputs[i].unused = 0; + } + + for (size_t i = 0; i < num_outputs; ++i) { + (*output_tensors)[i]->SetDtype(output_info_[i].data_type); + (*output_tensors)[i]->Resize(output_info_[i].shape); + output_info_[i].tensor_u8->SetDtype(DT_UINT8); + output_info_[i].tensor_u8->Resize(output_info_[i].shape); + outputs[i].data = reinterpret_cast( + output_info_[i].tensor_u8->raw_mutable_data()); + outputs[i].dataLen = + static_cast(output_info_[i].tensor_u8->raw_size()); + } + + int res = hexagon_hta_nn_execute_new(nn_id_, + inputs.data(), + num_inputs, + outputs.data(), + num_outputs); + + for (size_t i = 0; i < num_outputs; ++i) { + std::vector output_shape{ + outputs[i].batches, outputs[i].height, outputs[i].width, + outputs[i].depth}; + MACE_ASSERT(output_shape.size() == output_info_[i].shape.size(), + "wrong output shape inferred"); + for (size_t j = 0; j < output_shape.size(); ++j) { + MACE_ASSERT(static_cast(output_shape[j]) + == output_info_[i].shape[j], + "wrong output shape inferred"); + } + MACE_ASSERT(static_cast(outputs[i].data_valid_len) + == (*output_tensors)[i]->raw_size(), + "wrong output bytes inferred."); + + const uint8_t *output_data_u8 = output_info_[i].tensor_u8->data(); + float *output_data = (*output_tensors)[i]->mutable_data(); + Dequantize(output_data_u8, + output_info_[i].tensor_u8->size(), + output_info_[i].scale, + output_info_[i].zero_point, + output_data); + } + + return res == 0; +} + +} // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.h b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..26ea17bde45da1853efe222e9f7d30baa25d3471 --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h @@ -0,0 +1,51 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
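Note the division of labor in the HTA ExecuteGraphNew above: float inputs are quantized to uint8 on the host, using the per-tensor scale and zero_point carried in InputOutputInfo, before hexagon_hta_nn_execute_new runs, and uint8 outputs are dequantized back to float afterwards. A standalone sketch of that affine mapping, assuming the conventional q = round(x / scale) + zero_point form (the real helpers are QuantizeWithScaleAndZeropoint and Dequantize in mace/utils/quantize.h):

    // Sketch of per-tensor affine quantization, not the MACE implementation.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline uint8_t QuantizeOne(float x, float scale, int32_t zero_point) {
      // Map a float onto the uint8 grid, then clamp to [0, 255].
      int32_t q = static_cast<int32_t>(std::round(x / scale)) + zero_point;
      return static_cast<uint8_t>(std::max(0, std::min(255, q)));
    }

    inline float DequantizeOne(uint8_t q, float scale, int32_t zero_point) {
      // Inverse mapping back to float.
      return scale * (static_cast<int32_t>(q) - zero_point);
    }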
+
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
+
+#include <vector>
+
+#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#include "mace/core/tensor.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+
+class HexagonHTAWrapper : public HexagonControlWrapper {
+ public:
+  HexagonHTAWrapper() = default;
+
+  int GetVersion() override;
+  bool Config() override;
+  bool Init() override;
+  bool Finalize() override;
+  bool SetupGraph(const NetDef &net_def,
+                  const unsigned char *model_data) override;
+  bool ExecuteGraph(const Tensor &input_tensor,
+                    Tensor *output_tensor) override;
+  bool ExecuteGraphNew(const std::vector<Tensor *> &input_tensors,
+                       std::vector<Tensor *> *output_tensors) override;
+  bool TeardownGraph() override;
+  void PrintLog() override;
+  void PrintGraph() override;
+  void GetPerfInfo() override;
+  void ResetPerfInfo() override;
+  void SetDebugLevel(int level) override;
+
+  MACE_DISABLE_COPY_AND_ASSIGN(HexagonHTAWrapper);
+};
+}  // namespace mace
+
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
diff --git a/mace/core/runtime/hexagon/quantize.cc b/mace/core/runtime/hexagon/quantize.cc
deleted file mode 100644
index 31a62288f6bf6b4cec8fd0b5692d427ca9376b94..0000000000000000000000000000000000000000
--- a/mace/core/runtime/hexagon/quantize.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
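hexagon_hta_wrapper.h above is deliberately a method-for-method twin of hexagon_dsp_wrapper.h, so MaceEngine can drive either device through one pointer type. A trimmed reconstruction of the shared base class, inferred from the two identical override lists rather than copied from hexagon_control_wrapper.h (which, judging from the .cc files in this patch, also carries protected state such as nn_id_, num_inputs_, and the input/output info vectors):

    // Inferred sketch of the common interface; see
    // mace/core/runtime/hexagon/hexagon_control_wrapper.h for the real one.
    #include <vector>

    namespace mace {

    class NetDef;   // real type: generated from mace/proto/mace.proto
    class Tensor;   // real type: mace/core/tensor.h

    class HexagonControlWrapper {
     public:
      virtual ~HexagonControlWrapper() = default;
      virtual int GetVersion() = 0;
      virtual bool Config() = 0;
      virtual bool Init() = 0;
      virtual bool Finalize() = 0;
      virtual bool SetupGraph(const NetDef &net_def,
                              const unsigned char *model_data) = 0;
      virtual bool ExecuteGraph(const Tensor &input_tensor,
                                Tensor *output_tensor) = 0;
      virtual bool ExecuteGraphNew(const std::vector<Tensor *> &inputs,
                                   std::vector<Tensor *> *outputs) = 0;
      virtual bool TeardownGraph() = 0;
      virtual void PrintLog() = 0;
      virtual void PrintGraph() = 0;
      virtual void GetPerfInfo() = 0;
      virtual void ResetPerfInfo() = 0;
      virtual void SetDebugLevel(int level) = 0;
    };

    }  // namespace mace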
- -#include - -#include "mace/core/runtime/hexagon/quantize.h" - -namespace mace { - -void Quantizer::Quantize(const Tensor &in_tensor, - Tensor *out_tensor, - float *min_out, - float *max_out) { - if (in_tensor.size() == 0) return; - const float *in_data = in_tensor.data(); - float min_in = in_data[0]; - float max_in = in_data[0]; - for (index_t i = 0; i < in_tensor.size(); ++i) { - min_in = std::min(min_in, in_data[i]); - max_in = std::max(max_in, in_data[i]); - } - Quantize(in_tensor, min_in, max_in, out_tensor, min_out, max_out); -} - -void Quantizer::Quantize(const Tensor &in_tensor, - const float min_in, - const float max_in, - Tensor *out_tensor, - float *min_out, - float *max_out) { - float stepsize; - float recip_stepsize; - QuantizeAdjustRange(min_in, max_in, min_out, max_out, &stepsize, - &recip_stepsize); - - const float *in = in_tensor.data(); - uint8_t *out = out_tensor->mutable_data(); - - for (int i = 0; i < in_tensor.size(); i++) { - const float inval = in[i]; - float ival = - static_cast((inval - *min_out) * recip_stepsize + 0.5f); - if (ival < 0) ival = 0; - if (ival > 255) ival = 255; - out[i] = static_cast(ival); - } -} - -void Quantizer::QuantizeAdjustRange(float min_in, - float max_in, - float *min_out, - float *max_out, - float *stepsize_out, - float *recip_stepsize_out) { - float minval = std::min(0.0f, min_in); - float maxval = std::max(0.0f, max_in); - float range = std::max(0.0001f, maxval - minval); - float recip_stepsize = 255.0f / range; - // make z(q0) integer - if (minval < 0.0f) { - float z = -minval * recip_stepsize; - float zi = floorf(z); - float zf = z - zi; - if (zf > 0.0001f && zf < 0.9999f) { - if (zi > 0.0f && (zi >= 254.0f || (zf - 1.0f) * minval > zf * maxval)) { - range = -255.0f * minval / zi; - maxval = minval + range; - } else { - range = 255.0f * maxval / (254.0f - zi); - minval = maxval - range; - } - recip_stepsize = 255.0f / range; - } - } - - *min_out = minval; - *max_out = maxval; - *stepsize_out = range / 255.0f; - *recip_stepsize_out = recip_stepsize; -} - -void Quantizer::DeQuantize(const Tensor &in_tensor, - const float min_in, - const float max_in, - Tensor *out_tensor) { - float range = std::max(0.0001f, max_in - min_in); - float stepsize = range / 255.0f; - - const uint8_t *in = in_tensor.data(); - float *out = out_tensor->mutable_data(); - - for (int i = 0; i < out_tensor->size(); ++i) { - out[i] = (in[i] * stepsize) + min_in; - } -} - - -} // namespace mace diff --git a/mace/core/runtime/hexagon/quantize.h b/mace/core/runtime/hexagon/quantize.h deleted file mode 100644 index f121b0d07448d9c53070d25c74aaa91a8cde7015..0000000000000000000000000000000000000000 --- a/mace/core/runtime/hexagon/quantize.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
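The Quantizer deleted above derived its quantization range from the data's own min/max and then stretched [min, max] so the zero point lands exactly on an integer step (QuantizeAdjustRange); the HTA path that replaces it uses fixed scale/zero_point values from the model instead. A standalone numeric walk-through of the deleted adjustment, with illustrative values:

    // Reproduces the deleted QuantizeAdjustRange logic on one example.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      float minval = std::min(0.0f, -1.3f);  // input range [-1.3, 2.7]
      float maxval = std::max(0.0f, 2.7f);
      float range = std::max(0.0001f, maxval - minval);  // 4.0
      float recip_stepsize = 255.0f / range;
      float z = -minval * recip_stepsize;  // 82.875: zero is not on the grid
      float zi = std::floor(z);
      float zf = z - zi;
      if (zf > 0.0001f && zf < 0.9999f) {
        // Same branch structure as the deleted code: stretch whichever end
        // keeps the quantized zero exactly on an integer step.
        if (zi > 0.0f && (zi >= 254.0f || (zf - 1.0f) * minval > zf * maxval)) {
          range = -255.0f * minval / zi;
          maxval = minval + range;
        } else {
          range = 255.0f * maxval / (254.0f - zi);
          minval = maxval - range;
        }
      }
      // Prints roughly [-1.302907, 2.700000] with step 0.015698.
      std::printf("adjusted: [%f, %f], step %f\n",
                  minval, maxval, range / 255.0f);
      return 0;
    }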
- -#ifndef MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_ -#define MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_ - -#include "mace/core/tensor.h" - -namespace mace { - -class Quantizer { - public: - Quantizer() {} - ~Quantizer() {} - - void Quantize(const Tensor &in_tensor, - Tensor *out_tensor, - float *min_out, - float *max_out); - void Quantize(const Tensor &in_tensor, - const float min_in, - const float max_in, - Tensor *out_tensor, - float *min_out, - float *max_out); - void DeQuantize(const Tensor &in_tensor, - const float min_in, - const float max_in, - Tensor *out_tensor); - - private: - void QuantizeAdjustRange(float min_in, - float max_in, - float *min_out, - float *max_out, - float *stepsize, - float *recip_stepsize); - - MACE_DISABLE_COPY_AND_ASSIGN(Quantizer); -}; - -} // namespace mace - -#endif // MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_ diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 31cd5a541ee809d53f065e9ef63c67d819963c5f..0a5f9460f1026670224dfa28738cca15486a206e 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -24,7 +24,7 @@ #include #include -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/core/kv_storage.h" #include "mace/core/runtime/opencl/opencl_extension.h" #include "mace/utils/tuner.h" @@ -273,7 +273,7 @@ OpenCLRuntime::OpenCLRuntime( gpu_type_(UNKNOWN) { std::vector all_platforms; cl::Platform::get(&all_platforms); - if (all_platforms.size() == 0) { + if (all_platforms.empty()) { LOG(ERROR) << "No OpenCL platforms found"; return; } @@ -289,7 +289,7 @@ OpenCLRuntime::OpenCLRuntime( // get default device (CPUs, GPUs) of the default platform std::vector all_devices; default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); - if (all_devices.size() == 0) { + if (all_devices.empty()) { LOG(ERROR) << "No OpenCL devices found"; return; } diff --git a/mace/core/runtime/opencl/opencl_util.cc b/mace/core/runtime/opencl/opencl_util.cc index b190f05f4f258c27aabc0f209e271572257fb4f3..ca11414668d6e95f3d6bd70a13f48a312ea1c616 100644 --- a/mace/core/runtime/opencl/opencl_util.cc +++ b/mace/core/runtime/opencl/opencl_util.cc @@ -17,6 +17,7 @@ #include #include "mace/utils/logging.h" +#include "mace/utils/math.h" namespace mace { @@ -151,8 +152,9 @@ std::shared_ptr OpenCLUtil::CreateTransformOpDef( const std::vector &input_shape, const std::string &output_name, const mace::DataType dt, + const OpenCLBufferType buffer_type, const mace::MemoryType mem_type, - const DataFormat data_format) { + bool has_data_format) { std::unique_ptr op(new OperatorDef); std::string op_name = "mace_node_" + output_name; op->set_name(op_name); @@ -161,7 +163,7 @@ std::shared_ptr OpenCLUtil::CreateTransformOpDef( op->add_output(output_name); Argument *arg = op->add_arg(); arg->set_name("buffer_type"); - arg->set_i(static_cast(OpenCLBufferType::IN_OUT_CHANNEL)); + arg->set_i(static_cast(buffer_type)); arg = op->add_arg(); arg->set_name("mem_type"); arg->set_i(static_cast(mem_type)); @@ -169,8 +171,8 @@ std::shared_ptr OpenCLUtil::CreateTransformOpDef( arg->set_name("T"); arg->set_i(static_cast(dt)); arg = op->add_arg(); - arg->set_name("data_format"); - arg->set_i(data_format); + arg->set_name("has_data_format"); + arg->set_i(has_data_format); if (!input_shape.empty()) { OutputShape *shape = op->add_output_shape(); for (auto value : input_shape) { diff --git a/mace/core/runtime/opencl/opencl_util.h b/mace/core/runtime/opencl/opencl_util.h index 
ec399d87600dc9529c9d94f909ab6d45cd6f4a3e..ea0e239ee17c6826f23a73412ebc0a71d6dd25cf 100644 --- a/mace/core/runtime/opencl/opencl_util.h +++ b/mace/core/runtime/opencl/opencl_util.h @@ -48,8 +48,9 @@ class OpenCLUtil { const std::vector &input_shape, const std::string &output_name, const mace::DataType dt, + const OpenCLBufferType buffer_type, const MemoryType mem_type, - const DataFormat data_format); + bool has_data_format); }; } // namespace mace diff --git a/mace/core/tensor.h b/mace/core/tensor.h index ae999b05df7b7cc1df91cf4a716ea1b48da1b7e8..dc6c8f62d09cf52d2149c18e0ff9239856cbc2ac 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -97,8 +97,6 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { } } // namespace numerical_chars -enum FilterDataFormat { HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 }; - class Tensor { public: Tensor(Allocator *alloc, DataType type, @@ -304,10 +302,14 @@ class Tensor { if (buffer_ != nullptr) { MACE_CHECK(!has_opencl_image(), name_, ": Cannot resize image, use ResizeImage."); - if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) { + const index_t apply_size = raw_size() + + ((buffer_ != &buffer_slice_) ? MACE_EXTRA_BUFFER_PAD_SIZE : 0); + if (apply_size > buffer_->size()) { LOG(WARNING) << name_ << ": Resize buffer from size " << buffer_->size() - << " to " << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE; - return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE); + << " to " << apply_size; + MACE_CHECK(buffer_ != &buffer_slice_, + ": Cannot resize tensor with buffer slice"); + return buffer_->Resize(apply_size); } return MaceStatus::MACE_SUCCESS; } else { diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc index 03442869230066d081bef599d74d277283d386f0..a7cd149579bc6d6bf875a7b993010d6243a4d49a 100644 --- a/mace/core/testing/test_benchmark.cc +++ b/mace/core/testing/test_benchmark.cc @@ -20,7 +20,7 @@ #include #include "mace/core/testing/test_benchmark.h" -#include "mace/utils/env_time.h" +#include "mace/port/env.h" #include "mace/utils/logging.h" namespace mace { diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index eb10dc89bed268fc1bd8d5772e5acac551c90d0e..8009fda180a7d186ec9e27b0c0751cd34eeb0a11 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -68,7 +68,7 @@ const Tensor *Workspace::GetTensor(const std::string &name) const { if (tensor_map_.count(name)) { return tensor_map_.at(name).get(); } else { - LOG(WARNING) << "Tensor " << name << " does not exist."; + VLOG(1) << "Tensor " << name << " does not exist."; } return nullptr; } @@ -264,31 +264,35 @@ MaceStatus Workspace::PreallocateOutputTensor( bool is_quantize_model = IsQuantizedModel(net_def); for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) { std::unique_ptr tensor - (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first), - tensor_mem.second.second, + (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id), + tensor_mem.second.data_type, false, tensor_mem.first)); - if (mem_blocks[tensor_mem.second.first].mem_type() - == MemoryType::GPU_IMAGE) { - VLOG(1) << "Tensor: " << tensor_mem.first - << " Mem: " << tensor_mem.second.first - << " Data type: " << tensor->dtype() - << " Image shape: " - << tensor->UnderlyingBuffer()->shape()[0] - << ", " - << tensor->UnderlyingBuffer()->shape()[1]; - tensor->set_data_format(DataFormat::NHWC); - } else { - VLOG(1) << "Tensor: " << tensor_mem.first - << " Mem: " << tensor_mem.second.first - << " Data type: " << 
tensor->dtype() - << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); - if (mem_blocks[tensor_mem.second.first].mem_type() - == MemoryType::GPU_BUFFER || - is_quantize_model) { + if (tensor_mem.second.has_data_format) { + if (mem_blocks[tensor_mem.second.mem_id].mem_type() + == MemoryType::GPU_IMAGE) { + VLOG(1) << "Tensor: " << tensor_mem.first + << " Mem: " << tensor_mem.second.mem_id + << " Data type: " << tensor->dtype() + << " Image shape: " + << tensor->UnderlyingBuffer()->shape()[0] + << ", " + << tensor->UnderlyingBuffer()->shape()[1]; tensor->set_data_format(DataFormat::NHWC); } else { - tensor->set_data_format(DataFormat::NCHW); + VLOG(1) << "Tensor: " << tensor_mem.first + << " Mem: " << tensor_mem.second.mem_id + << " Data type: " << tensor->dtype() + << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); + if (mem_blocks[tensor_mem.second.mem_id].mem_type() + == MemoryType::GPU_BUFFER || + is_quantize_model) { + tensor->set_data_format(DataFormat::NHWC); + } else { + tensor->set_data_format(DataFormat::NCHW); + } } + } else { + tensor->set_data_format(DataFormat::DF_NONE); } tensor_map_[tensor_mem.first] = std::move(tensor); } diff --git a/mace/examples/android/README.md b/mace/examples/android/README.md index 5d2154901a7bd7e270fa67ec5eeaa818459b8d9c..d94a51367b3443d119515348a2111737c492dcad 100644 --- a/mace/examples/android/README.md +++ b/mace/examples/android/README.md @@ -5,7 +5,7 @@ How to build --------------- ```sh -cd mace/exampls/android +cd mace/examples/android ./build.sh dynamic # if libmace.a is needed, update `macelibrary/CMakeLists.txt` and run with `./build.sh static`. ``` diff --git a/mace/examples/cli/BUILD b/mace/examples/cli/BUILD.bazel similarity index 91% rename from mace/examples/cli/BUILD rename to mace/examples/cli/BUILD.bazel index 97e42b7df148e94bd11ab0d1f3cd7bc5470e3fd2..693009e37f0a5a49fc1ca4ffab771c67de25b7c5 100644 --- a/mace/examples/cli/BUILD +++ b/mace/examples/cli/BUILD.bazel @@ -3,6 +3,7 @@ load( "//mace:mace.bzl", "if_android", "if_hexagon_enabled", + "if_hta_enabled", "if_opencl_enabled", "if_openmp_enabled", ) @@ -33,8 +34,11 @@ cc_binary( "//mace/codegen:generated_libmace", "//mace/codegen:generated_opencl_binary", "//mace/codegen:generated_opencl_parameter", + "//mace/utils:utils_hdrs", ] + if_hexagon_enabled([ "//third_party/nnlib:libhexagon", + ]) + if_hta_enabled([ + "//third_party/hta", ]), ) @@ -63,5 +67,6 @@ cc_binary( "//mace/codegen:generated_mace_engine_factory", "//mace/codegen:generated_opencl_binary", "//mace/codegen:generated_opencl_parameter", + "//mace/utils:utils_hdrs", ], ) diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index 97d3608116914423be21b05f02307b64a850eabd..26f615d132421011207429be6cffc516751863bb 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include #include @@ -26,7 +27,11 @@ #include #include "gflags/gflags.h" +#include "mace/port/env.h" +#include "mace/port/file_system.h" #include "mace/public/mace.h" +#include "mace/utils/logging.h" +#include "mace/utils/string_util.h" // if convert model to code. 
#ifdef MODEL_GRAPH_FORMAT_CODE #include "mace/codegen/engine/mace_engine_factory.h" @@ -45,97 +50,6 @@ size_t OpenCLParameterSize(); namespace mace { namespace examples { -namespace str_util { - -std::vector Split(const std::string &str, char delims) { - std::vector result; - std::string tmp = str; - while (!tmp.empty()) { - size_t next_offset = tmp.find(delims); - result.push_back(tmp.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp = tmp.substr(next_offset + 1); - } - } - return result; -} - -} // namespace str_util - -namespace { -bool ReadBinaryFile(std::vector *data, - const std::string &filename) { - std::ifstream ifs(filename, std::ios::in | std::ios::binary); - if (!ifs.is_open()) { - return false; - } - ifs.seekg(0, ifs.end); - size_t length = ifs.tellg(); - ifs.seekg(0, ifs.beg); - - data->reserve(length); - data->insert(data->begin(), std::istreambuf_iterator(ifs), - std::istreambuf_iterator()); - if (ifs.fail()) { - return false; - } - ifs.close(); - - return true; -} - -bool MemoryMap(const std::string &file, - const unsigned char **data, - size_t *size) { - bool ret = true; - int fd = open(file.c_str(), O_RDONLY); - if (fd < 0) { - std::cerr << "Failed to open file " << file - << ", error code: " << strerror(errno) << std::endl; - ret = false; - } - struct stat st; - fstat(fd, &st); - *size = static_cast(st.st_size); - - *data = static_cast( - mmap(nullptr, *size, PROT_READ, MAP_PRIVATE, fd, 0)); - if (*data == static_cast(MAP_FAILED)) { - std::cerr << "Failed to map file " << file - << ", error code: " << strerror(errno) << std::endl; - ret = false; - } - - if (close(fd) < 0) { - std::cerr << "Failed to close file " << file - << ", error code: " << strerror(errno) << std::endl; - ret = false; - } - - return ret; -} - -bool MemoryUnMap(const unsigned char *data, - const size_t &size) { - bool ret = true; - if (data == nullptr || size == 0) { - std::cerr << "data is null or size is 0" << std::endl; - ret = false; - } - - if (munmap(const_cast(data), size) < 0) { - std::cerr << "Failed to unmap file, error code: " - << strerror(errno) << std::endl; - ret = false; - } - - return ret; -} - -} // namespace - void ParseShape(const std::string &str, std::vector *shape) { std::string tmp = str; while (!tmp.empty()) { @@ -165,11 +79,24 @@ DeviceType ParseDeviceType(const std::string &device_str) { return DeviceType::GPU; } else if (device_str.compare("HEXAGON") == 0) { return DeviceType::HEXAGON; + } else if (device_str.compare("HTA") == 0) { + return DeviceType::HTA; } else { return DeviceType::CPU; } } +DataFormat ParseDataFormat(const std::string &data_format_str) { + if (data_format_str == "NHWC") { + return DataFormat::NHWC; + } else if (data_format_str == "NCHW") { + return DataFormat::NCHW; + } else if (data_format_str == "OIHW") { + return DataFormat::OIHW; + } else { + return DataFormat::DF_NONE; + } +} DEFINE_string(model_name, "", @@ -186,6 +113,12 @@ DEFINE_string(output_node, DEFINE_string(output_shape, "1,224,224,2:1,1,1,10", "output shapes, separated by colon and comma"); +DEFINE_string(input_data_format, + "NHWC", + "input data formats, NONE|NHWC|NCHW"); +DEFINE_string(output_data_format, + "NHWC", + "output data formats, NONE|NHWC|NCHW"); DEFINE_string(input_file, "", "input file name | input file prefix for multiple inputs."); @@ -222,8 +155,10 @@ DEFINE_int32(cpu_affinity_policy, 1, bool RunModel(const std::vector &input_names, const std::vector> &input_shapes, + const std::vector &input_data_formats, const std::vector 
&output_names, - const std::vector> &output_shapes) { + const std::vector> &output_shapes, + const std::vector &output_data_formats) { // load model DeviceType device_type = ParseDeviceType(FLAGS_device); // configuration @@ -266,16 +201,26 @@ bool RunModel(const std::vector &input_names, std::shared_ptr engine; MaceStatus create_engine_status; - std::vector model_graph_data; - if (!ReadBinaryFile(&model_graph_data, FLAGS_model_file)) { - std::cerr << "Failed to read file: " << FLAGS_model_file << std::endl; + std::unique_ptr model_graph_data; + if (FLAGS_model_file != "") { + auto fs = GetFileSystem(); + auto status = fs->NewReadOnlyMemoryRegionFromFile(FLAGS_model_file.c_str(), + &model_graph_data); + if (status != MaceStatus::MACE_SUCCESS) { + LOG(FATAL) << "Failed to read file: " << FLAGS_model_file; + } } - const unsigned char *model_weights_data = nullptr; - size_t model_weights_data_size = 0; - if (!MemoryMap(FLAGS_model_data_file, - &model_weights_data, - &model_weights_data_size)) { - std::cerr << "Failed to read file: " << FLAGS_model_data_file << std::endl; + + std::unique_ptr model_weights_data; + if (FLAGS_model_data_file != "") { + auto fs = GetFileSystem(); + auto status = fs->NewReadOnlyMemoryRegionFromFile( + FLAGS_model_data_file.c_str(), + &model_weights_data); + if (status != MaceStatus::MACE_SUCCESS) { + LOG(FATAL) << "Failed to read file: " << FLAGS_model_data_file; + } + MACE_CHECK(model_weights_data->length() > 0); } // Only choose one of the two type based on the `model_graph_format` @@ -283,24 +228,24 @@ bool RunModel(const std::vector &input_names, #ifdef MODEL_GRAPH_FORMAT_CODE // if model_data_format == code, just pass an empty string("") // to model_data_file parameter. - create_engine_status = - CreateMaceEngineFromCode(FLAGS_model_name, - model_weights_data, - model_weights_data_size, - input_names, - output_names, - config, - &engine); + create_engine_status = CreateMaceEngineFromCode( + FLAGS_model_name, + reinterpret_cast(model_weights_data->data()), + model_weights_data->length(), + input_names, + output_names, + config, + &engine); #else - create_engine_status = - CreateMaceEngineFromProto(model_graph_data.data(), - model_graph_data.size(), - model_weights_data, - model_weights_data_size, - input_names, - output_names, - config, - &engine); + create_engine_status = CreateMaceEngineFromProto( + reinterpret_cast(model_graph_data->data()), + model_graph_data->length(), + reinterpret_cast(model_weights_data->data()), + model_weights_data->length(), + input_names, + output_names, + config, + &engine); #endif if (create_engine_status != MaceStatus::MACE_SUCCESS) { @@ -324,7 +269,8 @@ bool RunModel(const std::vector &input_names, inputs_size[input_names[i]] = input_size; auto buffer_in = std::shared_ptr(new float[input_size], std::default_delete()); - inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in); + inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in, + input_data_formats[i]); } for (size_t i = 0; i < output_count; ++i) { @@ -333,7 +279,8 @@ bool RunModel(const std::vector &input_names, std::multiplies()); auto buffer_out = std::shared_ptr(new float[output_size], std::default_delete()); - outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out); + outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out, + output_data_formats[i]); } if (!FLAGS_input_dir.empty()) { @@ -430,10 +377,6 @@ bool RunModel(const std::vector &input_names, } } - if (model_weights_data != nullptr) { - 
MemoryUnMap(model_weights_data, model_weights_data_size); - } - std::cout << "Finished" << std::endl; return true; @@ -466,13 +409,10 @@ int Main(int argc, char **argv) { << FLAGS_cpu_affinity_policy << std::endl; - std::vector input_names = str_util::Split(FLAGS_input_node, ','); - std::vector output_names = - str_util::Split(FLAGS_output_node, ','); - std::vector input_shapes = - str_util::Split(FLAGS_input_shape, ':'); - std::vector output_shapes = - str_util::Split(FLAGS_output_shape, ':'); + std::vector input_names = Split(FLAGS_input_node, ','); + std::vector output_names = Split(FLAGS_output_node, ','); + std::vector input_shapes = Split(FLAGS_input_shape, ':'); + std::vector output_shapes = Split(FLAGS_output_shape, ':'); const size_t input_count = input_shapes.size(); const size_t output_count = output_shapes.size(); @@ -485,11 +425,25 @@ int Main(int argc, char **argv) { ParseShape(output_shapes[i], &output_shape_vec[i]); } + std::vector raw_input_data_formats = + Split(FLAGS_input_data_format, ','); + std::vector raw_output_data_formats = + Split(FLAGS_output_data_format, ','); + std::vector input_data_formats(input_count); + std::vector output_data_formats(output_count); + for (size_t i = 0; i < input_count; ++i) { + input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]); + } + for (size_t i = 0; i < output_count; ++i) { + output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]); + } + bool ret = false; for (int i = 0; i < FLAGS_restart_round; ++i) { std::cout << "restart round " << i << std::endl; ret = - RunModel(input_names, input_shape_vec, output_names, output_shape_vec); + RunModel(input_names, input_shape_vec, input_data_formats, + output_names, output_shape_vec, output_data_formats); } if (ret) { return 0; diff --git a/mace/libmace/BUILD b/mace/libmace/BUILD.bazel similarity index 65% rename from mace/libmace/BUILD rename to mace/libmace/BUILD.bazel index 1cecc7f60f86ca15904d40eb57188a2e42a83006..36eff0c80a76c3adb0b9e8738281974bf1aa2280 100644 --- a/mace/libmace/BUILD +++ b/mace/libmace/BUILD.bazel @@ -10,13 +10,14 @@ licenses(["notice"]) # Apache 2.0 load( "//mace:mace.bzl", "if_android", + "if_linux", + "if_darwin", "if_neon_enabled", - "if_neon_enabled_str", "if_openmp_enabled", "if_android_armv7", "if_hexagon_enabled", + "if_hta_enabled", "if_opencl_enabled", - "if_opencl_enabled_str", "if_quantize_enabled", ) @@ -40,6 +41,8 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", + ]) + if_hta_enabled([ + "-DMACE_ENABLE_HTA", ]), deps = [ "//mace/ops", @@ -77,6 +80,7 @@ cc_library( visibility = ["//visibility:public"], ) +# For details, see https://github.com/bazelbuild/bazel/issues/5200 genrule( name = "libmace_static", srcs = [ @@ -87,10 +91,19 @@ genrule( "//mace/ops:internal_ops", "//mace/ops", "//mace/libmace", + "//mace/port:port_base", + "//mace/port/posix:port_posix", + "//mace/public", "//mace/utils", "//mace/proto:mace_cc", "@com_google_protobuf//:protobuf_lite", - ] + if_opencl_enabled([ + ] + if_android([ + "//mace/port/android:port_android", + ]) + if_linux([ + "//mace/port/linux:port_linux", + ]) + if_darwin([ + "//mace/port/darwin:port_darwin", + ]) + if_opencl_enabled([ "//mace/ops:opencl_kernels", "//mace/codegen:generated_opencl", ]) + if_neon_enabled([ @@ -103,20 +116,44 @@ genrule( "$(locations //mace/core:core) " + "$(locations //mace/ops:common) " + "$(locations //mace/ops:ref_kernels) " + - if_neon_enabled_str("$(locations //mace/ops:arm_neon_kernels) ") + - 
if_opencl_enabled_str("$(locations //mace/ops:opencl_kernels) ") + + if_neon_enabled( + "$(locations //mace/ops:arm_neon_kernels) ", + default_value = "", + ) + + if_opencl_enabled( + "$(locations //mace/ops:opencl_kernels) ", + default_value = "", + ) + "$(locations //mace/ops:internal_ops) " + "$(locations //mace/ops:ops) " + "$(locations //mace/libmace:libmace) " + + "$(locations //mace/port:port_base) " + + "$(locations //mace/port/posix:port_posix) " + + if_android( + "$(locations //mace/port/android:port_android) ", + default_value = "", + ) + + if_linux( + "$(locations //mace/port/linux:port_linux) ", + default_value = "", + ) + + if_darwin( + "$(locations //mace/port/darwin:port_darwin) ", + default_value = "", + ) + + "$(locations //mace/public:public) " + "$(locations //mace/utils:utils) " + "$(locations //mace/proto:mace_cc) " + "$(locations @com_google_protobuf//:protobuf_lite) " + - if_opencl_enabled_str("$(locations //mace/codegen:generated_opencl) ") + + if_opencl_enabled( + "$(locations //mace/codegen:generated_opencl) ", + default_value = "", + ) + "$@ " + "$$tmp_mri_file);" + "$(AR) -M <$$tmp_mri_file;" + - "rm -rf $$tmp_mri_file;" + - "$(STRIP) -x $@;", + "rm -rf $$tmp_mri_file;", + # "$(STRIP) -x $@;", # FIXME this will crash tools = ["//mace/python/tools:archive_static_lib"], visibility = ["//visibility:public"], ) diff --git a/mace/libmace/capability.cc b/mace/libmace/capability.cc index 2989cbc16f8432842858af66e7682678d7a09f2f..d37a62b6616b03bc476e7549b4e1b5d73357148d 100644 --- a/mace/libmace/capability.cc +++ b/mace/libmace/capability.cc @@ -142,14 +142,15 @@ void BMNet::SetUp() { // Add input and output information for (size_t i = 0; i < input_names_.size(); ++i) { - InputInfo *info = net_.add_input_info(); + InputOutputInfo *info = net_.add_input_info(); + info->set_data_format(DataFormat::NHWC); info->set_name(input_names_[i]); for (auto d : input_shapes_[i]) { info->add_dims(static_cast(d)); } } for (auto output_name : output_names_) { - OutputInfo *info = net_.add_output_info(); + InputOutputInfo *info = net_.add_output_info(); info->set_name(output_name); } // allocate weight data @@ -243,8 +244,8 @@ void BMNet::AddConv(const std::string &conv_type, op_def->add_output(output_name); AddIntsArg(op_def, "strides", strides); AddIntArg(op_def, "padding", padding_type); + AddIntArg(op_def, "has_data_format", 1); AddIntArg(op_def, "T", DT_HALF); - AddIntArg(op_def, "data_format", 1); if (has_relu6) { AddStringArg(op_def, "activation", "RELUX"); AddFloatArg(op_def, "max_limit", 6); @@ -270,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name, op_def->add_output(output); AddIntArg(op_def, "type", type); AddIntArg(op_def, "T", DT_HALF); - AddIntArg(op_def, "data_format", 1); + AddIntArg(op_def, "has_data_format", 1); OutputShape *shape = op_def->add_output_shape(); for (auto dim : output_shape) { shape->add_dims(dim); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index ce8a1cc77af08e027c91ed5c57e3b49a55ba7ada..927930fec485769b44a9df48284af3940034d9da 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -21,17 +21,21 @@ #include "mace/core/net.h" #include "mace/ops/ops_registry.h" #include "mace/ops/common/transpose.h" +#include "mace/utils/math.h" +#include "mace/utils/memory.h" +#include "mace/utils/stl_util.h" #include "mace/public/mace.h" +#include "mace/port/env.h" +#include "mace/port/file_system.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #endif 
// MACE_ENABLE_OPENCL -#ifdef MACE_ENABLE_HEXAGON -#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) #include "mace/core/runtime/hexagon/hexagon_device.h" -#endif // MACE_ENABLE_HEXAGON +#endif namespace mace { namespace { @@ -289,7 +293,10 @@ MaceTensor::MaceTensor(const std::vector &shape, std::shared_ptr data, const DataFormat format) { MACE_CHECK_NOTNULL(data.get()); - impl_ = std::unique_ptr(new MaceTensor::Impl()); + MACE_CHECK(format == DataFormat::NHWC || format == DataFormat::NCHW + || format == OIHW, + "MACE only support NHWC, NCHW and OIHW formats of input now."); + impl_ = make_unique(); impl_->shape = shape; impl_->data = data; impl_->format = format; @@ -298,11 +305,11 @@ MaceTensor::MaceTensor(const std::vector &shape, } MaceTensor::MaceTensor() { - impl_ = std::unique_ptr(new MaceTensor::Impl()); + impl_ = make_unique(); } MaceTensor::MaceTensor(const MaceTensor &other) { - impl_ = std::unique_ptr(new MaceTensor::Impl()); + impl_ = make_unique(); impl_->shape = other.shape(); impl_->data = other.data(); impl_->format = other.data_format(); @@ -310,7 +317,7 @@ MaceTensor::MaceTensor(const MaceTensor &other) { } MaceTensor::MaceTensor(const MaceTensor &&other) { - impl_ = std::unique_ptr(new MaceTensor::Impl()); + impl_ = make_unique(); impl_->shape = other.shape(); impl_->data = other.data(); impl_->format = other.data_format(); @@ -375,33 +382,31 @@ class MaceEngine::Impl { std::pair *output); private: - const unsigned char *model_data_; - size_t model_data_size_; + std::unique_ptr model_data_; std::unique_ptr op_registry_; DeviceType device_type_; std::unique_ptr device_; std::unique_ptr ws_; std::unique_ptr net_; bool is_quantized_model_; -#ifdef MACE_ENABLE_HEXAGON +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) std::unique_ptr hexagon_controller_; #endif - std::map input_info_map_; - std::map output_info_map_; + std::map input_info_map_; + std::map output_info_map_; MACE_DISABLE_COPY_AND_ASSIGN(Impl); }; MaceEngine::Impl::Impl(const MaceEngineConfig &config) : model_data_(nullptr), - model_data_size_(0), op_registry_(new OpRegistry), device_type_(config.impl_->device_type()), device_(nullptr), ws_(new Workspace()), net_(nullptr), is_quantized_model_(false) -#ifdef MACE_ENABLE_HEXAGON +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) , hexagon_controller_(nullptr) #endif { @@ -424,9 +429,9 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) config.impl_->use_gemmlowp())); } #endif -#ifdef MACE_ENABLE_HEXAGON - if (device_type_ == DeviceType::HEXAGON) { - device_.reset(new HexagonDevice()); +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) + if (device_type_ == DeviceType::HEXAGON || device_type_ == DeviceType::HTA) { + device_.reset(new HexagonDevice(device_type_)); } #endif MACE_CHECK_NOTNULL(device_); @@ -468,6 +473,9 @@ MaceStatus MaceEngine::Impl::Init( shape[i] = input_info_map_[input_name].dims(i); } input_tensor->Resize(shape); + // Set to the default data format + input_tensor->set_data_format(static_cast( + input_info_map_[input_name].data_format())); } for (auto output_name : output_nodes) { if (output_info_map_.find(output_name) == output_info_map_.end()) { @@ -475,15 +483,17 @@ MaceStatus MaceEngine::Impl::Init( << "' does not belong to model's outputs " << MakeString(MapKeys(output_info_map_)); } +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT); +#endif } -#ifdef 
MACE_ENABLE_HEXAGON - if (device_type_ == HEXAGON) { - hexagon_controller_.reset(new HexagonControlWrapper()); +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) + if (device_type_ == HEXAGON || device_type_ == HTA) { + hexagon_controller_ = CreateHexagonControlWrapper(device_type_); MACE_CHECK(hexagon_controller_->Config(), "hexagon config error"); MACE_CHECK(hexagon_controller_->Init(), "hexagon init error"); hexagon_controller_->SetDebugLevel( - static_cast(mace::logging::LogMessage::MinVLogLevel())); + static_cast(mace::port::MinVLogLevelFromEnv())); MACE_CHECK(hexagon_controller_->SetupGraph(*net_def, model_data), "hexagon setup graph error"); if (VLOG_IS_ON(2)) { @@ -511,7 +521,7 @@ MaceStatus MaceEngine::Impl::Init( ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator()); } MACE_RETURN_IF_ERROR(net_->Init()); -#ifdef MACE_ENABLE_HEXAGON +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) } #endif @@ -525,25 +535,25 @@ MaceStatus MaceEngine::Impl::Init( const std::string &model_data_file) { LOG(INFO) << "Loading Model Data"; - MemoryMap(model_data_file, &model_data_, &model_data_size_); + auto fs = GetFileSystem(); + MACE_RETURN_IF_ERROR(fs->NewReadOnlyMemoryRegionFromFile( + model_data_file.c_str(), &model_data_)); - MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, model_data_)); + MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, + reinterpret_cast(model_data_->data()))); if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON || + device_type_ == DeviceType::HTA || (device_type_ == DeviceType::CPU && ws_->diffused_buffer())) { - MemoryUnMap(model_data_, model_data_size_); - model_data_ = nullptr; + model_data_.reset(); } return MaceStatus::MACE_SUCCESS; } MaceEngine::Impl::~Impl() { LOG(INFO) << "Destroying MaceEngine"; - if (model_data_ != nullptr) { - MemoryUnMap(model_data_, model_data_size_); - } -#ifdef MACE_ENABLE_HEXAGON - if (device_type_ == HEXAGON) { +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) + if (device_type_ == HEXAGON || device_type_ == HTA) { if (VLOG_IS_ON(2)) { hexagon_controller_->GetPerfInfo(); hexagon_controller_->PrintLog(); @@ -557,47 +567,51 @@ MaceEngine::Impl::~Impl() { MaceStatus MaceEngine::Impl::TransposeInput( const std::pair &input, Tensor *input_tensor) { - if (device_->device_type() == DeviceType::CPU && - input.second.shape().size() == 4 && - input.second.data_format() == NHWC && - !is_quantized_model_) { - VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW"; - input_tensor->set_data_format(DataFormat::NCHW); - std::vector dst_dims = {0, 3, 1, 2}; - std::vector output_shape = - TransposeShape(input.second.shape(), dst_dims); - MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); - Tensor::MappingGuard input_guard(input_tensor); - float *input_data = input_tensor->mutable_data(); - return ops::Transpose(input.second.data().get(), - input.second.shape(), - dst_dims, - input_data); - } else if ( - (is_quantized_model_ || device_->device_type() == DeviceType::GPU) && - input.second.shape().size() == 4 && - input.second.data_format() == DataFormat::NCHW) { - VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC"; - std::vector dst_dims = {0, 2, 3, 1}; - input_tensor->set_data_format(DataFormat::NHWC); - std::vector output_shape = - TransposeShape(input.second.shape(), dst_dims); - MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); - Tensor::MappingGuard input_guard(input_tensor); - float *input_data = 
@@ -557,47 +567,51 @@ MaceEngine::Impl::~Impl() {
 MaceStatus MaceEngine::Impl::TransposeInput(
     const std::pair<const std::string, MaceTensor> &input,
     Tensor *input_tensor) {
-  if (device_->device_type() == DeviceType::CPU &&
-      input.second.shape().size() == 4 &&
-      input.second.data_format() == NHWC &&
-      !is_quantized_model_) {
-    VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
-    input_tensor->set_data_format(DataFormat::NCHW);
-    std::vector<int> dst_dims = {0, 3, 1, 2};
-    std::vector<index_t> output_shape =
-        TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
-    MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
-    Tensor::MappingGuard input_guard(input_tensor);
-    float *input_data = input_tensor->mutable_data<float>();
-    return ops::Transpose(input.second.data().get(),
-                          input.second.shape(),
-                          dst_dims,
-                          input_data);
-  } else if (
-      (is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
-      input.second.shape().size() == 4 &&
-      input.second.data_format() == DataFormat::NCHW) {
-    VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
-    std::vector<int> dst_dims = {0, 2, 3, 1};
-    input_tensor->set_data_format(DataFormat::NHWC);
-    std::vector<index_t> output_shape =
-        TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
-    MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
-    Tensor::MappingGuard input_guard(input_tensor);
-    float *input_data = input_tensor->mutable_data<float>();
-    return ops::Transpose(input.second.data().get(),
-                          input.second.shape(),
-                          dst_dims,
-                          input_data);
-  } else {
-    input_tensor->set_data_format(input.second.data_format());
-    MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
-    Tensor::MappingGuard input_guard(input_tensor);
-    float *input_data = input_tensor->mutable_data<float>();
-    memcpy(input_data, input.second.data().get(),
-           input_tensor->size() * sizeof(float));
-    return MaceStatus::MACE_SUCCESS;
+  bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
+  DataFormat data_format = DataFormat::DF_NONE;
+  if (has_data_format) {
+    if (device_->device_type() == DeviceType::CPU &&
+        input.second.shape().size() == 4 &&
+        input.second.data_format() == NHWC &&
+        !is_quantized_model_) {
+      VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
+      input_tensor->set_data_format(DataFormat::NCHW);
+      std::vector<int> dst_dims = {0, 3, 1, 2};
+      std::vector<index_t> output_shape =
+          TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
+      MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
+      Tensor::MappingGuard input_guard(input_tensor);
+      float *input_data = input_tensor->mutable_data<float>();
+      return ops::Transpose(input.second.data().get(),
+                            input.second.shape(),
+                            dst_dims,
+                            input_data);
+    } else if (
+        (is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
+        input.second.shape().size() == 4 &&
+        input.second.data_format() == DataFormat::NCHW) {
+      VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
+      std::vector<int> dst_dims = {0, 2, 3, 1};
+      input_tensor->set_data_format(DataFormat::NHWC);
+      std::vector<index_t> output_shape =
+          TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
+      MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
+      Tensor::MappingGuard input_guard(input_tensor);
+      float *input_data = input_tensor->mutable_data<float>();
+      return ops::Transpose(input.second.data().get(),
+                            input.second.shape(),
+                            dst_dims,
+                            input_data);
+    }
+    data_format = input.second.data_format();
   }
+  input_tensor->set_data_format(data_format);
+  MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
+  Tensor::MappingGuard input_guard(input_tensor);
+  float *input_data = input_tensor->mutable_data<float>();
+  memcpy(input_data, input.second.data().get(),
+         input_tensor->size() * sizeof(float));
+  return MaceStatus::MACE_SUCCESS;
 }

 MaceStatus MaceEngine::Impl::TransposeOutput(
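For readers checking the permutations above: `dst_dims[i]` names the source dimension that becomes output dimension `i`, so {0, 3, 1, 2} maps NHWC to NCHW and {0, 2, 3, 1} maps back. A self-contained sketch of the shape rule (TransposeShape's exact template signature is an assumption reconstructed from its call sites):

.. code:: cpp

    #include <cstdint>
    #include <vector>

    // Output dim i takes its extent from input dim dst_dims[i].
    std::vector<int64_t> TransposeShapeSketch(const std::vector<int64_t> &shape,
                                              const std::vector<int> &dst_dims) {
      std::vector<int64_t> out(dst_dims.size());
      for (size_t i = 0; i < dst_dims.size(); ++i) {
        out[i] = shape[dst_dims[i]];
      }
      return out;  // e.g. {1, 224, 224, 3} with {0, 3, 1, 2} -> {1, 3, 224, 224}
    }

The quantized-model and GPU paths go the other way (NCHW inputs transposed to NHWC) because those runtimes expect channels-last buffers.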
@@ -605,38 +619,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
     std::pair<const std::string, MaceTensor> *output) {
   // save output
   if (output_tensor != nullptr && output->second.data() != nullptr) {
-    if (device_->device_type() == DeviceType::CPU &&
-        output->second.shape().size() == 4 &&
-        output->second.data_format() != output_tensor->data_format()) {
-      MACE_CHECK(output_tensor->data_format() == NCHW);
-      VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC";
-      std::vector<int> dst_dims = {0, 2, 3, 1};
-      std::vector<int64_t> shape =
-          TransposeShape<index_t, int64_t>(output_tensor->shape(),
-                                           dst_dims);
-      int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
-                                            std::multiplies<int64_t>());
-      MACE_CHECK(output_size <= output->second.impl_->buffer_size)
-          << "Output size exceeds buffer size: shape"
-          << MakeString(shape) << " vs buffer size "
-          << output->second.impl_->buffer_size;
-      output->second.impl_->shape = shape;
-      Tensor::MappingGuard output_guard(output_tensor);
-      const float *output_data = output_tensor->data<float>();
-      return ops::Transpose(output_data,
-                            output_tensor->shape(),
-                            dst_dims,
-                            output->second.data().get());
-    } else if (device_->device_type() == DeviceType::GPU &&
+    if (output_tensor->data_format() != DataFormat::DF_NONE &&
+        output->second.data_format() != DataFormat::DF_NONE &&
         output->second.shape().size() == 4 &&
         output->second.data_format() != output_tensor->data_format()) {
       VLOG(1) << "Transform output " << output->first << " from "
               << output_tensor->data_format() << " to "
               << output->second.data_format();
-      std::vector<int> dst_dims = {0, 3, 1, 2};
-      if (output_tensor->data_format() == NCHW) {
+      std::vector<int> dst_dims;
+      if (output_tensor->data_format() == NCHW &&
+          output->second.data_format() == NHWC) {
         dst_dims = {0, 2, 3, 1};
+      } else if (output_tensor->data_format() == NHWC &&
+                 output->second.data_format() == NCHW) {
+        dst_dims = {0, 3, 1, 2};
+      } else {
+        LOG(FATAL) << "Not supported output data format: "
+                   << output->second.data_format() << " vs "
+                   << output_tensor->data_format();
       }
+      VLOG(1) << "Transform output " << output->first << " from "
+              << output_tensor->data_format() << " to "
+              << output->second.data_format();
       std::vector<int64_t> shape =
           TransposeShape<index_t, int64_t>(output_tensor->shape(),
                                            dst_dims);
@@ -698,15 +702,15 @@ MaceStatus MaceEngine::Impl::Run(
     Tensor *output_tensor = ws_->GetTensor(output.first);
     output_tensors.push_back(output_tensor);
   }
-#ifdef MACE_ENABLE_HEXAGON
-  if (device_type_ == HEXAGON) {
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  if (device_type_ == HEXAGON || device_type_ == HTA) {
     MACE_CHECK(input_tensors.size() == 1 && output_tensors.size() == 1,
                "HEXAGON not support multiple inputs and outputs yet.");
     hexagon_controller_->ExecuteGraphNew(input_tensors, &output_tensors);
   } else {
 #endif
     MACE_RETURN_IF_ERROR(net_->Run(run_metadata));
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
   }
 #endif

@@ -725,7 +729,7 @@ MaceStatus MaceEngine::Impl::Run(
 }

 MaceEngine::MaceEngine(const MaceEngineConfig &config):
-    impl_(new MaceEngine::Impl(config)) {}
+    impl_(make_unique<MaceEngine::Impl>(config)) {}

 MaceEngine::~MaceEngine() = default;
diff --git a/mace/libmace/mace_version_script.lds b/mace/libmace/mace_version_script.lds
index 9b7d34538ad20417e59051420048e98998c5afd7..a088736de4d1e6c0ab07a397ae5d4164689726b7 100644
--- a/mace/libmace/mace_version_script.lds
+++ b/mace/libmace/mace_version_script.lds
@@ -7,19 +7,20 @@ mace {
     *CreateMaceEngineFromProto*;
     *GetBigLittleCoreIDs*;
     *MaceVersion*;
+    *GetCapability*;

     # api for static library of models
-    *mace*logging*LogMessage*;
+    *mace*port**;
     *mace*MaceStatus*;
     *mace*NetDef*;
     *mace*MemoryType*;
     *mace*DataType*;
-    *mace*InputInfo*;
-    *mace*OutputInfo*;
+    *mace*InputOutputInfo*;
     *mace*OutputShape*;
     *mace*OperatorDef*;
     *mace*ConstTensor*;
     *mace*Argument*;
+    *mace*Split*;
     *mace*MemoryBlock*;

     *google*protobuf*;
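Two build-level changes follow. The version script above widens the exported-symbol list: everything matching the mace::port pattern replaces the old logging symbols, and the new InputOutputInfo proto message supersedes the separate InputInfo and OutputInfo entries. The mace.bzl change below gives the select() helpers an optional default_value, so callers can supply an explicit fallback branch, e.g. if_android(["-DANDROID"], ["-DPLAIN_LINUX"]) instead of always falling back to an empty list (those flag values are illustrative, not taken from the patch).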
diff --git a/mace/mace.bzl b/mace/mace.bzl
index 2afe4560e323d2ad1cbe731832c5a918b09b177b..1f577e7e47d02f6ce23391205110687b49d1efdf 100644
--- a/mace/mace.bzl
+++ b/mace/mace.bzl
@@ -1,15 +1,21 @@
 # -*- Python -*-

-def if_android(a):
+def if_android(a, default_value = []):
     return select({
         "//mace:android": a,
-        "//conditions:default": [],
+        "//conditions:default": default_value,
     })

-def if_not_android(a):
+def if_linux(a, default_value = []):
     return select({
-        "//mace:android": [],
-        "//conditions:default": a,
+        "//mace:linux": a,
+        "//conditions:default": default_value,
+    })
+
+def if_darwin(a, default_value = []):
+    return select({
+        "//mace:darwin": a,
+        "//conditions:default": default_value,
     })

 def if_android_armv7(a):
@@ -36,16 +42,10 @@ def if_arm_linux_armhf(a):
         "//conditions:default": []
     })

-def if_neon_enabled(a):
-    return select({
-        "//mace:neon_enabled": a,
-        "//conditions:default": [],
-    })
-
-def if_neon_enabled_str(a):
+def if_neon_enabled(a, default_value = []):
     return select({
         "//mace:neon_enabled": a,
-        "//conditions:default": "",
+        "//conditions:default": default_value,
     })

 def if_hexagon_enabled(a):
@@ -60,22 +60,29 @@ def if_not_hexagon_enabled(a):
         "//conditions:default": a,
     })

-def if_openmp_enabled(a):
+def if_hta_enabled(a):
     return select({
-        "//mace:openmp_enabled": a,
+        "//mace:hta_enabled": a,
         "//conditions:default": [],
     })

-def if_opencl_enabled(a):
+def if_hexagon_or_hta_enabled(a):
     return select({
-        "//mace:opencl_enabled": a,
+        "//mace:hexagon_enabled": a,
+        "//mace:hta_enabled": a,
+        "//conditions:default": [],
+    })
+
+def if_openmp_enabled(a):
+    return select({
+        "//mace:openmp_enabled": a,
         "//conditions:default": [],
     })

-def if_opencl_enabled_str(a):
+def if_opencl_enabled(a, default_value = []):
     return select({
         "//mace:opencl_enabled": a,
-        "//conditions:default": "",
+        "//conditions:default": default_value,
     })

 def if_quantize_enabled(a):
diff --git a/mace/ops/BUILD b/mace/ops/BUILD.bazel
similarity index 94%
rename from mace/ops/BUILD
rename to mace/ops/BUILD.bazel
index 7f03ce12221a7e074e59a34cdb38f918b86ff51a..bbf5f34822b734eb6555702cc219454bcf4ec051 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD.bazel
@@ -54,37 +54,17 @@ cc_library(

 cc_library(
     name = "testing",
-    srcs = glob(
-        [
-            "testing/*.cc",
-        ],
-    ),
-    hdrs = glob(
-        [
-            "testing/*.h",
-        ],
-    ),
+    hdrs = [
+        "testing/test_utils.h",
+    ],
     copts = [
         "-Werror",
         "-Wextra",
         "-Wno-missing-field-initializers",
-    ] + if_openmp_enabled([
-        "-fopenmp",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
-    ]) + if_android_armv7([
-        "-mfpu=neon",
-        "-mfloat-abi=softfp",
-    ]) + if_opencl_enabled([
-        "-DMACE_ENABLE_OPENCL",
-    ]) + if_quantize_enabled([
-        "-DMACE_ENABLE_QUANTIZE",
-    ]) + if_hexagon_enabled([
-        "-DMACE_ENABLE_HEXAGON",
-    ]),
+    ],
     deps = [
         "//mace/core",
-        "@gtest//:gtest",
+        "@gtest",
     ],
 )

@@ -254,7 +234,7 @@ cc_library(
         ":arm_neon_kernels",
         ":ref_kernels",
         ":testing",
-        "@gtest//:gtest",
+        "@gtest",
     ],
     alwayslink = 1,
 )

@@ -289,7 +269,7 @@ cc_library(
         ":opencl_kernels",
         ":ref_kernels",
         ":testing",
-        "@gtest//:gtest",
+        "@gtest",
     ],
     alwayslink = 1,
 )

@@ -329,12 +309,12 @@ cc_library(
             "ops_registry.h",
             "ops_test_util.h",
             "fixpoint.h",
-            "gemmlowp_util.h",
+            "common/gemmlowp_util.h",
             "quantization_util.h",
         ],
     ) + if_quantize_enabled(glob([
         "fixpoint.h",
-        "gemmlowp_util.h",
+        "common/gemmlowp_util.h",
         "quantization_util.h",
     ])),
     copts = [
diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc
index a9e28f1e2b08d985f657d3fa10a9a431a542c9e1..29fee227df0ebac83d9a2e8c9a275a62aff8c68a 100644
--- a/mace/ops/activation.cc
+++ b/mace/ops/activation.cc
@@ -22,6 +22,7 @@
 #include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/activation.h"
 #endif // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"

 namespace mace {
 namespace ops {
@@ -88,9 +89,8 @@ class ActivationOp : public Operation {
     MemoryType mem_type;
     if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
-      kernel_.reset(
-          new opencl::image::ActivationKernel<T>(type, relux_max_limit,
-                                                 leakyrelu_coefficient));
+      kernel_ = make_unique<opencl::image::ActivationKernel<T>>(
+          type, relux_max_limit, leakyrelu_coefficient);
     } else {
       MACE_NOT_IMPLEMENTED;
     }
diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc
index cc11a0efc55fe9568c3635c5a72b54f81b60b1ac..5e387d87684d833eb40c5ebe30e564ef74bb55cd 100644
--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -24,6 +24,7 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/addn.h"
 #endif // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"

 namespace mace {
 namespace ops {
@@ -107,7 +108,7 @@ class AddNOp : public Operation {
   explicit AddNOp(OpConstructContext *context) : Operation(context) {
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::AddNKernel<T>);
+      kernel_ = make_unique<opencl::image::AddNKernel<T>>();
     } else {
       MACE_NOT_IMPLEMENTED;
     }
diff --git a/mace/ops/arm/activation_neon.cc b/mace/ops/arm/activation_neon.cc
index 6010d71419dc9ec8f7f091281555f824e0e6e99b..09cfd8d4e0e0bd7ba09bf5f7e31c1bb57afa818b 100644
--- a/mace/ops/arm/activation_neon.cc
+++ b/mace/ops/arm/activation_neon.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/activation_neon.h b/mace/ops/arm/activation_neon.h
index a61b974b3c0dd002dece670a20381f0b9a4a4103..d640e689a2c1e91cb614826b9af1b53d7c90ef94 100644
--- a/mace/ops/arm/activation_neon.h
+++ b/mace/ops/arm/activation_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/common_neon.h b/mace/ops/arm/common_neon.h
index c3451ea0e473b97b8befeb86d20a3743bdd83de9..8d28f5581c6ad43dd90fe1965e16e6ab7bec48c8 100644
--- a/mace/ops/arm/common_neon.h
+++ b/mace/ops/arm/common_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/conv_2d_neon.h b/mace/ops/arm/conv_2d_neon.h
deleted file mode 100644
index 711ef2c8ecf72bad68c8577338218a36e58e140a..0000000000000000000000000000000000000000
--- a/mace/ops/arm/conv_2d_neon.h
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#ifndef MACE_OPS_ARM_CONV_2D_NEON_H_ -#define MACE_OPS_ARM_CONV_2D_NEON_H_ - -#include "mace/core/types.h" -#include "mace/ops/sgemm.h" - -namespace mace { -namespace ops { - -void Conv2dNeonK1x1S1(const float *input, - const float *filter, - const index_t batch, - const index_t height, - const index_t width, - const index_t in_channels, - const index_t out_channels, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer); - -void Conv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK5x5S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK1x7S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK7x1S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK7x7S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK7x7S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK7x7S3(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK1x15S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK15x1S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -// calculate one output channel and one input channel -inline void Conv2dCPUKHxKWCalc(const float *in_ptr, - const float *filter_ptr, - const index_t in_width, - const index_t filter_height, - const index_t filter_width, - const index_t out_height, - const index_t out_width, - float *out_ptr, - const int stride) { - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - for (int i = 0; i < filter_height; ++i) { - for (int j = 0; j < filter_width; ++j) { - out_ptr[h * out_width + w] += - in_ptr[(h * stride + i) * in_width + (w * stride + j)] * - filter_ptr[i * filter_width + j]; - } - } - } - } -} - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_CONV_2D_NEON_H_ diff --git a/mace/ops/arm/conv_2d_neon_15x1.cc b/mace/ops/arm/conv_2d_neon_15x1.cc deleted file mode 100644 index 8523e494cebf92e359b0d53c9a3e2a7ab8cc2fcb..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_2d_neon_15x1.cc +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" -#include "mace/utils/utils.h" - -namespace mace { -namespace ops { - -inline void Conv2dCPUK15x1Calc(const float *in_ptr, - const float *filter_ptr, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t w, - const index_t tile_width, - const index_t out_image_size, - float *out_ptr, - const index_t io, - const int stride) { - for (index_t ih = 0; ih < out_height; ++ih) { - for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) { - for (int i = 0; i < 15; ++i) { - for (int j = 0; j < 1; ++j) { - out_ptr[io * out_image_size + ih * out_width + w + iw] += - in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] * - filter_ptr[io * in_channels * 15 + i * 1 + j]; - } - } - } - } -} - -// Ho = 4, Wo = 1, Co = 1 -void Conv2dNeonK15x1S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - const index_t tile_width = - out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3]; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - for (index_t w = 0; w < out_shape[3]; w += tile_width) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - float *out_ptr_base = output + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + m * in_channels * 15 + c * 15; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) - /* load filter (1 outch x 4 height x 1 width) */ - float32x4_t vf0, vf1, vf2, vf3; - vf0 = vld1q_f32(filter_ptr); - vf1 = vld1q_f32(filter_ptr + 4); - vf2 = vld1q_f32(filter_ptr + 8); - vf3 = vld1q_f32(filter_ptr + 11); - - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) { - // load output - index_t out_offset = h * out_width + w + wt; - // output (1 outch x 4 height x 1 width): vo_outch_height - float32x4_t vo = {out_ptr_base[out_offset], - out_ptr_base[out_offset + out_width], - out_ptr_base[out_offset + 2 * out_width], - out_ptr_base[out_offset + 3 * out_width]}; - - // input offset - index_t in_offset = h * in_width + w + wt; - // input (3 slide) - float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; - float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], - in_ptr_base[in_offset + 13 * in_width], - in_ptr_base[in_offset + 14 * in_width], - in_ptr_base[in_offset + 15 * in_width]}; - float32x4_t vi16 = 
{in_ptr_base[in_offset + 16 * in_width], - in_ptr_base[in_offset + 17 * in_width]}; - float32x4_t vi1 = vextq_f32(vi0, vi4, 1); - float32x4_t vi2 = vextq_f32(vi0, vi4, 2); - float32x4_t vi3 = vextq_f32(vi0, vi4, 3); - float32x4_t vi5 = vextq_f32(vi4, vi8, 1); - float32x4_t vi6 = vextq_f32(vi4, vi8, 2); - float32x4_t vi7 = vextq_f32(vi4, vi8, 3); - float32x4_t vi9 = vextq_f32(vi8, vi12, 1); - float32x4_t vi10 = vextq_f32(vi8, vi12, 2); - float32x4_t vi11 = vextq_f32(vi8, vi12, 3); - float32x4_t vi13 = vextq_f32(vi12, vi16, 1); - float32x4_t vi14 = vextq_f32(vi12, vi16, 2); - - vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); - vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); - vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); - - out_ptr_base[out_offset] = vo[0]; - out_ptr_base[out_offset + out_width] = vo[1]; - out_ptr_base[out_offset + 2 * out_width] = vo[2]; - out_ptr_base[out_offset + 3 * out_width] = vo[3]; - } // wt - } // h -#else - Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels, - out_height, out_width, w, tile_width, - out_image_size, out_ptr_base, 0, 1); -#endif - } // c - } // w - } // m - } // b -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/conv_2d_neon_1x1.cc b/mace/ops/arm/conv_2d_neon_1x1.cc deleted file mode 100644 index 819f5f334f466508f3e7b1affae07a2a156ea358..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_2d_neon_1x1.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/arm/conv_2d_neon.h" - -namespace mace { -namespace ops { - -void Conv2dNeonK1x1S1(const float *input, - const float *filter, - const index_t batch, - const index_t height, - const index_t width, - const index_t in_channels, - const index_t out_channels, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer) { - for (index_t b = 0; b < batch; ++b) { - sgemm->Run(filter, - input + b * in_channels * height * width, - 1, - out_channels, - in_channels, - in_channels, - height * width, - false, - false, - true, - false, - output + b * out_channels * height * width, - scratch_buffer); - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/conv_2d_neon_1x15.cc b/mace/ops/arm/conv_2d_neon_1x15.cc deleted file mode 100644 index 33b9abbfebc2c921423b15288012487038d2b370..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_2d_neon_1x15.cc +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" -#include "mace/utils/logging.h" -#include "mace/utils/utils.h" - -namespace mace { -namespace ops { - -inline void Conv2dCPUK1x15Calc(const float *in_ptr, - const float *filter_ptr, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t h, - const index_t tile_height, - const index_t out_width, - const index_t out_image_size, - float *out_ptr, - const index_t io, - const int stride) { - for (index_t ih = 0; ih < tile_height && h + ih < out_height; ++ih) { - for (index_t iw = 0; iw < out_width; ++iw) { - for (int i = 0; i < 1; ++i) { - for (int j = 0; j < 15; ++j) { - out_ptr[io * out_image_size + (h + ih) * out_width + iw] += - in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] * - filter_ptr[io * in_channels * 15 + i * 15 + j]; - } - } - } - } -} - -// Ho = 1, Wo = 4, Co = 1 -void Conv2dNeonK1x15S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - const index_t tile_height = - out_shape[1] < 4 ? 
RoundUpDiv4(out_shape[2]) : out_shape[2]; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - for (index_t h = 0; h < out_shape[2]; h += tile_height) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - float *out_ptr_base = output + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + m * in_channels * 15 + c * 15; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) - /* load filter (1 outch x 4 height x 1 width) */ - float32x4_t vf0, vf1, vf2, vf3; - vf0 = vld1q_f32(filter_ptr); - vf1 = vld1q_f32(filter_ptr + 4); - vf2 = vld1q_f32(filter_ptr + 8); - vf3 = vld1q_f32(filter_ptr + 11); - - for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo; - // load output - index_t out_offset = (h + ht) * out_width + w; - vo = vld1q_f32(out_ptr_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, - vi10, vi11, vi12, vi13, vi14, vi16; - // input offset - index_t in_offset = (h + ht) * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi12 = vld1q_f32(in_ptr_base + in_offset + 12); - vi16 = vld1q_f32(in_ptr_base + in_offset + 16); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - vi7 = vextq_f32(vi4, vi8, 3); - vi9 = vextq_f32(vi8, vi12, 1); - vi10 = vextq_f32(vi8, vi12, 2); - vi11 = vextq_f32(vi8, vi12, 3); - vi13 = vextq_f32(vi12, vi16, 1); - vi14 = vextq_f32(vi12, vi16, 2); - - vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); - vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); - vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); - - vst1q_f32(out_ptr_base + out_offset, vo); - } // w - } // ht -#else - Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels, - out_height, h, tile_height, out_width, - out_image_size, out_ptr_base, 0, 1); -#endif - } // c - } // h - } // m - } // b -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/conv_2d_neon_1x7.cc b/mace/ops/arm/conv_2d_neon_1x7.cc deleted file mode 100644 index e5e249d39b3b51e4c0525d4c6777520c1ff4d846..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_2d_neon_1x7.cc +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" - -namespace mace { -namespace ops { - -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK1x7S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) - float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7; -#if defined(MACE_ENABLE_NEON) - const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7; - const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7; - const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7; - /* load filter (4 outch x 1 height x 4 width) */ - float32x4_t vf00, vf01; - float32x4_t vf10, vf11; - float32x4_t vf20, vf21; - float32x4_t vf30, vf31; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - vf10 = vld1q_f32(filter_ptr1); - vf11 = vld1q_f32(filter_ptr1 + 3); - vf20 = vld1q_f32(filter_ptr2); - vf21 = vld1q_f32(filter_ptr2 + 3); - vf30 = vld1q_f32(filter_ptr3); - vf31 = vld1q_f32(filter_ptr3 + 3); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (4 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; - // input offset - index_t in_offset = h * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = 
vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - /* outch 0 */ - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); - /* outch 1 */ - vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); - vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); - vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); - vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); - vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); - vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); - vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); - /* outch 2 */ - vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); - vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); - vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); - vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); - vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); - vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); - vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); - /* outch 3 */ - vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); - vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); - vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); - vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); - vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); - vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); - vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); -#else - /* outch 0 */ - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); - /* outch 1 */ - vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); - vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); - vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); - /* outch 2 */ - vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); - vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); - vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); - /* outch 3 */ - vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); - vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); - vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#endif - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + out_offset, vo1); - vst1q_f32(out_ptr2_base + out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - } // w - } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7, - in_width, 1, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif - } // c - } else { - for (index_t 
mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7; -#if defined(MACE_ENABLE_NEON) - /* load filter (1 outch x 1 height x 4 width) */ - float32x4_t vf00, vf01; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; - // input offset - index_t in_offset = h * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); -#else - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); -#endif - - vst1q_f32(out_ptr0_base + out_offset, vo0); - } // w - } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7, - out_height, out_width, out_ptr0_base, 1); -#endif - } // c - } - } // if - } // m - } // b -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/conv_2d_neon_7x1.cc b/mace/ops/arm/conv_2d_neon_7x1.cc deleted file mode 100644 index 7aa9309bfd605faa51a833e99eb0c15dd06ded3a..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_2d_neon_7x1.cc +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" - -namespace mace { -namespace ops { - -// Ho = 4, Wo = 1, Co = 4 -void Conv2dNeonK7x1S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) - float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7; -#if defined(MACE_ENABLE_NEON) - const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7; - const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7; - const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7; - /* load filter (4 outch x 4 height x 1 width) */ - float32x4_t vf00, vf01; - float32x4_t vf10, vf11; - float32x4_t vf20, vf21; - float32x4_t vf30, vf31; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - vf10 = vld1q_f32(filter_ptr1); - vf11 = vld1q_f32(filter_ptr1 + 3); - vf20 = vld1q_f32(filter_ptr2); - vf21 = vld1q_f32(filter_ptr2 + 3); - vf30 = vld1q_f32(filter_ptr3); - vf31 = vld1q_f32(filter_ptr3 + 3); - - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t w = 0; w < out_width; ++w) { - // load output - index_t out_offset = h * out_width + w; - // output (4 outch x 4 height x 1 width): vo_outch_height - float32x4_t vo0 = {out_ptr0_base[out_offset], - out_ptr0_base[out_offset + out_width], - out_ptr0_base[out_offset + 2 * out_width], - out_ptr0_base[out_offset + 3 * out_width]}; - float32x4_t vo1 = {out_ptr1_base[out_offset], - out_ptr1_base[out_offset + out_width], - out_ptr1_base[out_offset + 2 * out_width], - out_ptr1_base[out_offset + 3 * out_width]}; - float32x4_t vo2 = {out_ptr2_base[out_offset], - out_ptr2_base[out_offset + out_width], - out_ptr2_base[out_offset + 2 * out_width], - out_ptr2_base[out_offset + 3 * out_width]}; - float32x4_t vo3 = {out_ptr3_base[out_offset], - out_ptr3_base[out_offset + out_width], - out_ptr3_base[out_offset + 2 * out_width], - out_ptr3_base[out_offset + 3 * out_width]}; - - // input offset - index_t in_offset = h * in_width + w; - // input (3 slide) - float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - 
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width]}; - float32x4_t vi1 = vextq_f32(vi0, vi4, 1); - float32x4_t vi2 = vextq_f32(vi0, vi4, 2); - float32x4_t vi3 = vextq_f32(vi0, vi4, 3); - float32x4_t vi5 = vextq_f32(vi4, vi8, 1); - float32x4_t vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - /* outch 0 */ - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); - /* outch 1 */ - vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); - vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); - vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); - vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); - vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); - vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); - vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); - /* outch 2 */ - vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); - vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); - vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); - vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); - vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); - vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); - vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); - /* outch 3 */ - vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); - vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); - vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); - vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); - vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); - vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); - vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); -#else - /* outch 0 */ - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); - /* outch 1 */ - vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); - vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); - vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); - /* outch 2 */ - vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); - vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); - vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); - /* outch 3 */ - vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); - vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); - vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#endif - - out_ptr0_base[out_offset] = vo0[0]; - out_ptr0_base[out_offset + out_width] = vo0[1]; - out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; - out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; - 
out_ptr1_base[out_offset] = vo1[0]; - out_ptr1_base[out_offset + out_width] = vo1[1]; - out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; - out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; - out_ptr2_base[out_offset] = vo2[0]; - out_ptr2_base[out_offset + out_width] = vo2[1]; - out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; - out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; - out_ptr3_base[out_offset] = vo3[0]; - out_ptr3_base[out_offset + out_width] = vo3[1]; - out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; - out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; - } // w - } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7, - in_width, 7, 1, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7; -#if defined(MACE_ENABLE_NEON) - /* load filter (1 outch x 4 height x 1 width) */ - float32x4_t vf00, vf01; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t w = 0; w < out_width; ++w) { - // load output - index_t out_offset = h * out_width + w; - // output (1 outch x 4 height x 1 width): vo_outch_height - float32x4_t vo0 = {out_ptr0_base[out_offset], - out_ptr0_base[out_offset + out_width], - out_ptr0_base[out_offset + 2 * out_width], - out_ptr0_base[out_offset + 3 * out_width]}; - - // input offset - index_t in_offset = h * in_width + w; - // input (3 slide) - float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; - float32x4_t vi1 = vextq_f32(vi0, vi4, 1); - float32x4_t vi2 = vextq_f32(vi0, vi4, 2); - float32x4_t vi3 = vextq_f32(vi0, vi4, 3); - float32x4_t vi5 = vextq_f32(vi4, vi8, 1); - float32x4_t vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); -#else - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); -#endif - - out_ptr0_base[out_offset] = vo0[0]; - out_ptr0_base[out_offset + out_width] = vo0[1]; - out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; - out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; - } // w - } // h -#else - 
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 1, - out_height, out_width, out_ptr0_base, 1); -#endif - } // c - } - } // if - } // m - } // b -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/conv_winograd.h b/mace/ops/arm/conv_winograd.h deleted file mode 100644 index 396d1870b96a4565e56ea5d48faf3e46d616a4da..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_winograd.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ARM_CONV_WINOGRAD_H_ -#define MACE_OPS_ARM_CONV_WINOGRAD_H_ - -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif - -#include "mace/core/types.h" -#include "mace/ops/sgemm.h" - -namespace mace { -namespace ops { - -void TransformFilter4x4(const float *filter, - const index_t in_channels, - const index_t out_channels, - float *output); - -void TransformFilter8x8(const float *filter, - const index_t in_channels, - const index_t out_channels, - float *output); - -void WinoGradConv3x3s1(const float *input, - const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - const int out_tile_size, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer); - -void WinoGradConv3x3s1(const float *input, - const float *transformed_filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - const int out_tile_size, - float *transformed_input, - float *transformed_output, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer); - -void ConvRef3x3s1(const float *input, - const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - float *output); - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_CONV_WINOGRAD_H_ diff --git a/mace/ops/arm/conv_winograd_test.cc b/mace/ops/arm/conv_winograd_test.cc deleted file mode 100644 index 4f28472d5199dcb2f72667e30da10db82c0ba7d2..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_winograd_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include - -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/ops/arm/conv_winograd.h" - -namespace mace { -namespace ops { - -TEST(ConvWinogradTest, winograd) { - index_t batch = 1; - index_t in_height = 32; - index_t in_width = 32; - index_t in_channels = 64; - index_t out_channels = 128; - - index_t out_height = in_height - 2; - index_t out_width = in_width - 2; - index_t input_size = batch * in_channels * in_height * in_width; - index_t filter_size = 3 * 3 * in_channels * out_channels; - index_t output_size = batch * out_channels * out_height * out_width; - - Tensor input(GetCPUAllocator(), DataType::DT_FLOAT); - Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT); - Tensor output(GetCPUAllocator(), DataType::DT_FLOAT); - Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT); - - input.Resize({batch, in_channels, in_height, in_width}); - filter.Resize({out_channels, in_channels, 3, 3}); - output.Resize({batch, out_channels, out_height, out_width}); - output_ref.Resize({batch, out_channels, out_height, out_width}); - - float *input_data = input.mutable_data(); - float *filter_data = filter.mutable_data(); - float *output_data = output.mutable_data(); - float *output_data_ref = output.mutable_data(); - - std::random_device rd; - std::mt19937 gen(rd()); - std::normal_distribution nd(0, 1); - std::generate(input_data, input_data + input_size, [&gen, &nd] { - return std::max(-1.0f, std::min(1.0f, nd(gen))); - }); - std::generate(filter_data, filter_data + filter_size, [&gen, &nd] { - return std::max(-1.0f, std::min(1.0f, nd(gen))); - }); - - ops::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width, - in_channels, out_channels, output_data_ref); - - SGemm sgemm; - ops::WinoGradConv3x3s1(input_data, filter_data, batch, in_height, - in_width, in_channels, out_channels, 6, - output_data, &sgemm, nullptr); - - // test - for (index_t i = 0; i < output_size; ++i) { - EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1) << " with index " << i; - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/deconv_2d_neon.h b/mace/ops/arm/deconv_2d_neon.h index 62e3e9199b00345a8e41751bfb1b165e96cdd634..f45fa923bdd19c6420a4ab0e6b751541ce3b1f76 100644 --- a/mace/ops/arm/deconv_2d_neon.h +++ b/mace/ops/arm/deconv_2d_neon.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/mace/ops/arm/deconv_2d_neon_2x2.cc b/mace/ops/arm/deconv_2d_neon_2x2.cc index 74ddbecc48c367c07692e43b6260ece23aee6abb..674864c8b6527631d4d5800a9e892bc662826bc7 100644 --- a/mace/ops/arm/deconv_2d_neon_2x2.cc +++ b/mace/ops/arm/deconv_2d_neon_2x2.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/ops/arm/deconv_2d_neon.h" namespace mace { diff --git a/mace/ops/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc index 356680949af572838a070c47f91a69427751d596..04f62325817f5a02919ea859c3e5c5ba4a974f40 100644 --- a/mace/ops/arm/deconv_2d_neon_3x3.cc +++ b/mace/ops/arm/deconv_2d_neon_3x3.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/ops/arm/deconv_2d_neon.h" namespace mace { diff --git a/mace/ops/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc index a023154aec04c94ff4dcc77767999522a87a0368..443a188f322c448c6e8bf36b14b3babc91725cf4 100644 --- a/mace/ops/arm/deconv_2d_neon_4x4.cc +++ b/mace/ops/arm/deconv_2d_neon_4x4.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/ops/arm/deconv_2d_neon.h" namespace mace { diff --git a/mace/ops/arm/depthwise_conv2d_neon.h b/mace/ops/arm/depthwise_conv2d_neon.h index a4973ed59e0d31b4dfd97359e0cf3c99b3377c31..b610178c54fd097beb92bf0135d152cd4a96ed29 100644 --- a/mace/ops/arm/depthwise_conv2d_neon.h +++ b/mace/ops/arm/depthwise_conv2d_neon.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc index 652d0231bdc60c6f3b53c65b3a94131e7de47d15..ced509e0d87d796b7ff2ecedc5ae187a926502af 100644 --- a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc +++ b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #include #endif -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/ops/arm/depthwise_conv2d_neon.h" namespace mace { diff --git a/mace/ops/arm/depthwise_deconv2d_neon.h b/mace/ops/arm/depthwise_deconv2d_neon.h index 70f2bb40545cde307ff1c8f75e69607bf6864486..8df6dba15bd61d22054f0d0ecac2b35bd060ec76 100644 --- a/mace/ops/arm/depthwise_deconv2d_neon.h +++ b/mace/ops/arm/depthwise_deconv2d_neon.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
index 404c903d30d0ca30695c94d889f9346764967c64..6bba47c280bfb1fe22055c7440e9180b6afdc98e 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
+++ b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/ops/arm/depthwise_deconv2d_neon.h"
 
 namespace mace {
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
index 1b59264e600c064f76dddfbcf3b6b4ec83d535a2..677eb152bb5f7d984a9f7bd003bcbf0e42a1da1f 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
+++ b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/ops/arm/deconv_2d_neon.h"
 
 namespace mace {
diff --git a/mace/ops/arm/fp32/conv_2d.cc b/mace/ops/arm/fp32/conv_2d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..799ee521b83dc22b3e192dc364f486b929b7df7f
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_2d.cc
@@ -0,0 +1,247 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/utils/memory.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+void Conv2dBase::CalOutputShapeAndPadSize(const Tensor *input,
+                                          const Tensor *filter,
+                                          const int out_tile_height,
+                                          const int out_tile_width,
+                                          std::vector<index_t> *output_shape,
+                                          std::vector<int> *in_pad_size,
+                                          std::vector<int> *out_pad_size) {
+  in_pad_size->resize(4);
+  out_pad_size->resize(4);
+  output_shape->resize(4);
+
+  const index_t in_height = input->dim(2);
+  const index_t in_width = input->dim(3);
+
+  const index_t stride_h = strides_[0];
+  const index_t stride_w = strides_[1];
+  const index_t dilation_h = dilations_[0];
+  const index_t dilation_w = dilations_[1];
+  const index_t filter_h = filter->dim(2);
+  const index_t filter_w = filter->dim(3);
+
+  std::vector<int> paddings(2);
+  if (paddings_.empty()) {
+    CalcNCHWPaddingAndOutputSize(input->shape().data(),
+                                 filter->shape().data(),
+                                 dilations_.data(),
+                                 strides_.data(),
+                                 padding_type_,
+                                 output_shape->data(),
+                                 paddings.data());
+  } else {
+    paddings = paddings_;
+    CalcNCHWOutputSize(input->shape().data(),
+                       filter->shape().data(),
+                       paddings_.data(),
+                       dilations_.data(),
+                       strides_.data(),
+                       RoundType::FLOOR,
+                       output_shape->data());
+  }
+  const index_t out_height = (*output_shape)[2];
+  const index_t out_width = (*output_shape)[3];
+  const index_t
+      padded_out_height = RoundUp(out_height, out_tile_height);
+  const index_t padded_out_width = RoundUp(out_width, out_tile_width);
+  const index_t padded_in_height =
+      std::max(in_height + paddings[0], (padded_out_height - 1) * stride_h
+          + (filter_h - 1) * dilation_h + 1);
+  const index_t padded_in_width =
+      std::max(in_width + paddings[1], (padded_out_width - 1) * stride_w
+          + (filter_w - 1) * dilation_w + 1);
+
+  (*in_pad_size)[0] = paddings[0] >> 1;
+  (*in_pad_size)[1] =
+      static_cast<int>(padded_in_height - in_height - (*in_pad_size)[0]);
+  (*in_pad_size)[2] = paddings[1] >> 1;
+  (*in_pad_size)[3] =
+      static_cast<int>(padded_in_width - in_width - (*in_pad_size)[2]);
+
+  (*out_pad_size)[0] = 0;
+  (*out_pad_size)[1] = static_cast<int>(padded_out_height - out_height);
+  (*out_pad_size)[2] = 0;
+  (*out_pad_size)[3] = static_cast<int>(padded_out_width - out_width);
+}
+
+MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
+                                            const Tensor *input,
+                                            const Tensor *filter,
+                                            Tensor *output,
+                                            const int out_tile_height,
+                                            const int out_tile_width,
+                                            std::unique_ptr<const Tensor>
+                                            *padded_input,
+                                            std::unique_ptr<Tensor>
+                                            *padded_output) {
+  std::vector<index_t> output_shape;
+  std::vector<int> in_pad_size;
+  std::vector<int> out_pad_size;
+  CalOutputShapeAndPadSize(input,
+                           filter,
+                           out_tile_height,
+                           out_tile_width,
+                           &output_shape,
+                           &in_pad_size,
+                           &out_pad_size);
+  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+  const index_t batch = input->dim(0);
+  const index_t in_channels = input->dim(1);
+  const index_t in_height = input->dim(2);
+  const index_t in_width = input->dim(3);
+  const index_t out_channels = output->dim(1);
+  const index_t out_height = output->dim(2);
+  const index_t out_width = output->dim(3);
+
+  const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
+  const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
+  const index_t
+      padded_out_height = out_height + out_pad_size[0] + out_pad_size[1];
+  const index_t
+      padded_out_width = out_width + out_pad_size[2] + out_pad_size[3];
+  const bool is_in_padded =
+      padded_in_height != in_height || padded_in_width != in_width;
+  const bool is_out_padded =
+      padded_out_height != out_height || padded_out_width != out_width;
+
+  auto scratch_buffer = context->device()->scratch_buffer();
+  const index_t padded_in_size =
+      MACE_EXTRA_BUFFER_PAD_SIZE + (is_in_padded ? PadAlignSize(
+          sizeof(float) * batch * in_channels * padded_in_height
+              * padded_in_width) : 0);
+  const index_t padded_out_size = is_out_padded ? PadAlignSize(
+      sizeof(float) * batch * out_channels * padded_out_height
+          * padded_out_width) : 0;
+
+  scratch_buffer->Rewind();
+  scratch_buffer->GrowSize(padded_in_size + padded_out_size);
+  if (is_in_padded) {
+    std::unique_ptr<Tensor>
+        padded_in =
+        make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
+                            DataType::DT_FLOAT);
+    padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
+    PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
+    *padded_input = std::move(padded_in);
+  }
+  if (is_out_padded) {
+    std::unique_ptr<Tensor>
+        padded_out = make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
+                                         DataType::DT_FLOAT);
+    padded_out->Resize({batch, out_channels, padded_out_height,
+                        padded_out_width});
+    *padded_output = std::move(padded_out);
+  }
+  return MaceStatus::MACE_SUCCESS;
+}
+
+void Conv2dBase::PadInput(const Tensor &src,
+                          const int pad_top,
+                          const int pad_left,
+                          mace::Tensor *dst) {
+  if (dst == &src) return;
+  const index_t batch = src.dim(0);
+  const index_t channels = src.dim(1);
+  const index_t height = src.dim(2);
+  const index_t width = src.dim(3);
+  const index_t padded_height = dst->dim(2);
+  const index_t padded_width = dst->dim(3);
+  const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
+  const int pad_right = static_cast<int>(padded_width - width - pad_left);
+  auto in_data = src.data<float>();
+  auto padded_in_data = dst->mutable_data<float>();
+
+  const index_t img_size = height * width;
+  const index_t padded_img_size = padded_height * padded_width;
+
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t bc = b * channels + c;
+      const float *in_base = in_data + bc * img_size;
+      float *padded_in_base = padded_in_data + bc * padded_img_size;
+
+      memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
+      padded_in_base += pad_top * padded_width;
+      for (index_t h = 0; h < height; ++h) {
+        memset(padded_in_base,
+               0,
+               sizeof(float) * pad_left);
+        memcpy(padded_in_base + pad_left,
+               in_base,
+               sizeof(float) * width);
+        memset(padded_in_base + pad_left + width,
+               0,
+               sizeof(float) * pad_right);
+        in_base += width;
+        padded_in_base += padded_width;
+      }
+      memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
+    }
+  }
+}
+
+void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) {
+  if (dst == &src) return;
+  const index_t batch = dst->dim(0);
+  const index_t channels = dst->dim(1);
+  const index_t height = dst->dim(2);
+  const index_t width = dst->dim(3);
+  const index_t padded_height = src.dim(2);
+  const index_t padded_width = src.dim(3);
+
+  auto padded_out_data = src.data<float>();
+  auto out_data = dst->mutable_data<float>();
+
+  const index_t img_size = height * width;
+  const index_t padded_img_size = padded_height * padded_width;
+
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t bc = (b * channels + c);
+      float *out_base = out_data + bc * img_size;
+      const float *padded_out_base = padded_out_data + bc * padded_img_size;
+
+      for (index_t h = 0; h < height; ++h) {
+        memcpy(out_base,
+               padded_out_base,
+               sizeof(float) * width);
+        out_base += width;
+        padded_out_base += padded_width;
+      }  // h
+    }  // c
+  }  // b
+}
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
diff --git a/mace/ops/arm/fp32/conv_2d.h b/mace/ops/arm/fp32/conv_2d.h
index 7d77cf14941d30c24227ef10d948688519a7e995..832f6f2fa35d999ee6192e61f340f070776f5d1f 100644
--- a/mace/ops/arm/fp32/conv_2d.h
+++ b/mace/ops/arm/fp32/conv_2d.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,10 +15,14 @@
 #ifndef MACE_OPS_ARM_FP32_CONV_2D_H_
 #define MACE_OPS_ARM_FP32_CONV_2D_H_
 
+#include <memory>
+#include <vector>
+
 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
 #include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
 
 namespace mace {
 namespace ops {
@@ -27,13 +31,51 @@ namespace fp32 {
 
 class Conv2dBase {
  public:
-  Conv2dBase() = default;
+  Conv2dBase(const std::vector<int> strides,
+             const std::vector<int> dilations,
+             const std::vector<int> paddings,
+             const Padding padding_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type) {}
+
   virtual ~Conv2dBase() = default;
+
   virtual MaceStatus Compute(
       const OpContext *context,
       const Tensor *input,
       const Tensor *filter,
       Tensor *output) = 0;
+
+ protected:
+  void CalOutputShapeAndPadSize(const Tensor *input,
+                                const Tensor *filter,
+                                const int out_tile_height,
+                                const int out_tile_width,
+                                std::vector<index_t> *output_shape,
+                                std::vector<int> *in_pad_size,
+                                std::vector<int> *out_pad_size);
+
+  MaceStatus ResizeOutAndPadInOut(const OpContext *context,
+                                  const Tensor *input,
+                                  const Tensor *filter,
+                                  Tensor *output,
+                                  const int out_tile_height,
+                                  const int out_tile_width,
+                                  std::unique_ptr<const Tensor> *padded_input,
+                                  std::unique_ptr<Tensor> *padded_output);
+
+  void PadInput(const Tensor &src,
+                const int pad_top,
+                const int pad_left,
+                Tensor *dst);
+  void UnPadOutput(const Tensor &src, Tensor *dst);
+
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
 };
 
 }  // namespace fp32
diff --git a/mace/ops/arm/fp32/conv_2d_1x1.cc b/mace/ops/arm/fp32/conv_2d_1x1.cc
index b34e19aae8c8712bf08052deaff7abfe6bd1eb95..d5e03652bbd25bad8eb43bfb67b2ef98092b9b2f 100644
--- a/mace/ops/arm/fp32/conv_2d_1x1.cc
+++ b/mace/ops/arm/fp32/conv_2d_1x1.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
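(Editor's aside, not part of the patch: the tile-aware padding arithmetic in `CalOutputShapeAndPadSize` above is easier to follow with concrete numbers. Below is a minimal standalone sketch, using plain `long long` instead of MACE's `index_t`; the constants mirror a 3x3/stride-1 case with total paddings {2, 2} and the 2x4 output tile that `Conv2dK3x3S1` passes to `ResizeOutAndPadInOut`. All names here are illustrative, not MACE APIs.)

    // Standalone illustration of the padded-size computation above.
    #include <algorithm>
    #include <cstdio>

    long long RoundUp(long long v, long long factor) {
      return (v + factor - 1) / factor * factor;
    }

    int main() {
      const long long in_h = 7, in_w = 7, filter = 3, stride = 1, dilation = 1;
      const long long pad_h = 2, pad_w = 2;    // total padding per axis
      const long long tile_h = 2, tile_w = 4;  // kernel's output tile

      const long long out_h = (in_h + pad_h - filter) / stride + 1;  // 7
      const long long out_w = (in_w + pad_w - filter) / stride + 1;  // 7

      // Round the output up to whole tiles, then size the padded input so
      // the kernel can always consume full tiles without bounds checks.
      const long long padded_out_h = RoundUp(out_h, tile_h);  // 8
      const long long padded_out_w = RoundUp(out_w, tile_w);  // 8
      const long long padded_in_h = std::max(
          in_h + pad_h,
          (padded_out_h - 1) * stride + (filter - 1) * dilation + 1);  // 10
      const long long padded_in_w = std::max(
          in_w + pad_w,
          (padded_out_w - 1) * stride + (filter - 1) * dilation + 1);  // 10

      std::printf("out %lldx%lld, padded out %lldx%lld, padded in %lldx%lld\n",
                  out_h, out_w, padded_out_h, padded_out_w,
                  padded_in_h, padded_in_w);
      // Prints: out 7x7, padded out 8x8, padded in 10x10.
      // The extra padded output row/column is what UnPadOutput discards.
      return 0;
    }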
- #include "mace/ops/arm/fp32/conv_2d_1x1.h" namespace mace { @@ -25,20 +24,68 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, const Tensor *filter, Tensor *output) { index_t batch = input->dim(0); - index_t height = input->dim(2); - index_t width = input->dim(3); + index_t in_height = input->dim(2); + index_t in_width = input->dim(3); index_t in_channels = input->dim(1); - index_t out_channels = filter->dim(0); - MACE_RETURN_IF_ERROR(output->Resize({batch, out_channels, height, width})); - context->device()->scratch_buffer()->Rewind(); + + std::vector output_shape; + std::vector in_pad_size; + std::vector out_pad_size; + CalOutputShapeAndPadSize(input, + filter, + 1, + 1, + &output_shape, + &in_pad_size, + &out_pad_size); + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + const index_t out_channels = output_shape[1]; + const index_t out_height = output_shape[2]; + const index_t out_width = output_shape[3]; + const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1]; + const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3]; + + // pad input and transform input + const bool is_in_padded = + in_height != padded_in_height || in_width != padded_in_width; + auto scratch_buffer = context->device()->scratch_buffer(); + const index_t padded_in_size = is_in_padded ? PadAlignSize( + sizeof(float) * batch * in_channels * padded_in_height + * padded_in_width) : 0; + const index_t pack_filter_size = + PadAlignSize(sizeof(float) * out_channels * in_channels); + const index_t pack_input_size = + PadAlignSize( + sizeof(float) * in_channels * padded_in_height * padded_in_width); + const index_t pack_output_size = + PadAlignSize( + sizeof(float) * out_channels * padded_in_height * padded_in_width); + + const index_t gemm_pack_size = + pack_filter_size + pack_input_size + pack_output_size; + + scratch_buffer->Rewind(); + scratch_buffer->GrowSize(padded_in_size + gemm_pack_size); + + const Tensor *padded_in = input; + Tensor tmp_padded_in + (scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT); + if (is_in_padded) { + tmp_padded_in.Resize({batch, in_channels, padded_in_height, + padded_in_width}); + PadInput(*input, in_pad_size[0], in_pad_size[2], &tmp_padded_in); + padded_in = &tmp_padded_in; + } + return gemm_.Compute(context, filter, - input, + padded_in, batch, out_channels, in_channels, in_channels, - height * width, + out_height * out_width, false, false, false, diff --git a/mace/ops/arm/fp32/conv_2d_1x1.h b/mace/ops/arm/fp32/conv_2d_1x1.h index fd2077ec2a0f0458ee980ff6c35e2f11e1a6d0ad..68b792fd96b3c5dd77504614894d3008bbd01e01 100644 --- a/mace/ops/arm/fp32/conv_2d_1x1.h +++ b/mace/ops/arm/fp32/conv_2d_1x1.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ #ifndef MACE_OPS_ARM_FP32_CONV_2D_1X1_H_ #define MACE_OPS_ARM_FP32_CONV_2D_1X1_H_ +#include #include "mace/public/mace.h" #include "mace/core/tensor.h" #include "mace/core/op_context.h" @@ -28,7 +29,8 @@ namespace fp32 { class Conv2dK1x1 : public Conv2dBase { public: - Conv2dK1x1() : gemm_(true) {} + Conv2dK1x1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK1x1() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/conv_2d_1xn.cc b/mace/ops/arm/fp32/conv_2d_1xn.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ff99d8021438d8b851b65d6ee2c662e01e72917 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_1xn.cc @@ -0,0 +1,821 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "mace/ops/arm/fp32/conv_2d_1xn.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = 
filter_data + m * in_channels * 7 + c * 7; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + /* outch 0 */ + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); + /* outch 1 */ + vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); + vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); + vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); + vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); + /* outch 2 */ + vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); + vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); + vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); + vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); + /* outch 3 */ + vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); + vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); + vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); + vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); +#else + /* outch 0 */ + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = 
vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); +#endif + + vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); +#endif + + vst1q_f32(out_ptr0_base + out_offset, vo0); + } // w + } // h + } // c + } + } // if + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, 
+ Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 4, + 1, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 4 height x 1 width) */ + float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t w = 0; w < out_width; ++w) { + // load output + index_t out_offset = h * out_width + w; + // output (4 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo0 = {out_ptr0_base[out_offset], + out_ptr0_base[out_offset + out_width], + out_ptr0_base[out_offset + 2 * out_width], + out_ptr0_base[out_offset + 3 * out_width]}; + float32x4_t vo1 = {out_ptr1_base[out_offset], + out_ptr1_base[out_offset + out_width], + out_ptr1_base[out_offset + 2 * out_width], + out_ptr1_base[out_offset + 3 * out_width]}; + float32x4_t vo2 = {out_ptr2_base[out_offset], + out_ptr2_base[out_offset + out_width], + out_ptr2_base[out_offset + 2 * out_width], + out_ptr2_base[out_offset + 3 * out_width]}; + float32x4_t vo3 = {out_ptr3_base[out_offset], + out_ptr3_base[out_offset + out_width], + out_ptr3_base[out_offset + 2 * out_width], + 
out_ptr3_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + /* outch 0 */ + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); + /* outch 1 */ + vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); + vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); + vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); + vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); + /* outch 2 */ + vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); + vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); + vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); + vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); + /* outch 3 */ + vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); + vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); + vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); + vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); +#else + /* outch 0 */ + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, 
vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); +#endif + + out_ptr0_base[out_offset] = vo0[0]; + out_ptr0_base[out_offset + out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + out_ptr1_base[out_offset] = vo1[0]; + out_ptr1_base[out_offset + out_width] = vo1[1]; + out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; + out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; + out_ptr2_base[out_offset] = vo2[0]; + out_ptr2_base[out_offset + out_width] = vo2[1]; + out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; + out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; + out_ptr3_base[out_offset] = vo3[0]; + out_ptr3_base[out_offset + out_width] = vo3[1]; + out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; + out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t w = 0; w < out_width; ++w) { + // load output + index_t out_offset = h * out_width + w; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo0 = {out_ptr0_base[out_offset], + out_ptr0_base[out_offset + out_width], + out_ptr0_base[out_offset + 2 * out_width], + out_ptr0_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); 
+#endif + + out_ptr0_base[out_offset] = vo0[0]; + out_ptr0_base[out_offset + out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + } // w + } // h + } // c + } + } // if + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + + +// ==== + +MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_height = + out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2]; + +#pragma omp parallel for collapse(3) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t h = 0; h < out_shape[2]; h += tile_height) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo; + // load output + index_t out_offset = (h + ht) * out_width + w; + vo = vld1q_f32(out_ptr_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, + vi10, vi11, vi12, vi13, vi14, vi16; + // input offset + index_t in_offset = (h + ht) * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi12 = vld1q_f32(in_ptr_base + in_offset + 12); + vi16 = vld1q_f32(in_ptr_base + in_offset + 16); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + vi7 = vextq_f32(vi4, vi8, 3); + vi9 = vextq_f32(vi8, vi12, 1); + vi10 = vextq_f32(vi8, vi12, 2); + vi11 = vextq_f32(vi8, vi12, 3); + vi13 = vextq_f32(vi12, vi16, 1); + 
vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + vst1q_f32(out_ptr_base + out_offset, vo); + } // w + } // ht + } // c + } // h + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 4, + 1, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_width = + out_shape[1] < 4 ? 
RoundUpDiv4(out_shape[3]) : out_shape[3]; + +#pragma omp parallel for collapse(3) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t w = 0; w < out_shape[3]; w += tile_width) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) { + // load output + index_t out_offset = h * out_width + w + wt; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo = {out_ptr_base[out_offset], + out_ptr_base[out_offset + out_width], + out_ptr_base[out_offset + 2 * out_width], + out_ptr_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w + wt; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], + in_ptr_base[in_offset + 13 * in_width], + in_ptr_base[in_offset + 14 * in_width], + in_ptr_base[in_offset + 15 * in_width]}; + float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], + in_ptr_base[in_offset + 17 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + float32x4_t vi7 = vextq_f32(vi4, vi8, 3); + float32x4_t vi9 = vextq_f32(vi8, vi12, 1); + float32x4_t vi10 = vextq_f32(vi8, vi12, 2); + float32x4_t vi11 = vextq_f32(vi8, vi12, 3); + float32x4_t vi13 = vextq_f32(vi12, vi16, 1); + float32x4_t vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, 
vi14, vget_high_f32(vf3), 1);
+
+              out_ptr_base[out_offset] = vo[0];
+              out_ptr_base[out_offset + out_width] = vo[1];
+              out_ptr_base[out_offset + 2 * out_width] = vo[2];
+              out_ptr_base[out_offset + 3 * out_width] = vo[3];
+            }  // wt
+          }  // h
+        }  // c
+      }  // w
+    }  // m
+  }  // b
+  UnPadOutput(*out_tensor, output);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/fp32/conv_2d_1xn.h b/mace/ops/arm/fp32/conv_2d_1xn.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4a5e8995f9ebf5b85c2622684c13e558eb2900f
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_2d_1xn.h
@@ -0,0 +1,86 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
+
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK1x7S1 : public Conv2dBase {
+ public:
+  Conv2dK1x7S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK1x7S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK7x1S1 : public Conv2dBase {
+ public:
+  Conv2dK7x1S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x1S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK1x15S1 : public Conv2dBase {
+ public:
+  Conv2dK1x15S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK1x15S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK15x1S1 : public Conv2dBase {
+ public:
+  Conv2dK15x1S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK15x1S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
diff --git a/mace/ops/arm/conv_2d_neon_3x3.cc b/mace/ops/arm/fp32/conv_2d_3x3.cc
similarity index 85%
rename from mace/ops/arm/conv_2d_neon_3x3.cc
rename to mace/ops/arm/fp32/conv_2d_3x3.cc
index ecae6810696d07d82d688a183720c7acb3243f8d..a8ce5fa64074c08362d0e839a80d111221bc19cb 100644
--- a/mace/ops/arm/conv_2d_neon_3x3.cc
+++ b/mace/ops/arm/fp32/conv_2d_3x3.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,22 +12,49 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/core/macros.h" -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_3x3.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 2, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); -// Ho = 2, Wo = 4, Co = 2 -void Conv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -42,26 +69,26 @@ void Conv2dNeonK3x3S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 1 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 1) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr0 = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9; + const float + *in_ptr0 = input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; -#if defined(MACE_ENABLE_NEON) float *out_ptr1 = out_ptr1_base; const float *in_ptr1 = - input + b * in_batch_size + c * in_image_size + 1 * in_width; + input_data + b * in_batch_size + c * in_image_size + 1 * in_width; const float *in_ptr2 = - input + b * in_batch_size + c * in_image_size + 2 * in_width; + input_data + b * in_batch_size + c * in_image_size + 2 * in_width; const float *in_ptr3 = - input + b * in_batch_size + c * in_image_size + 3 * in_width; - const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9; -#endif -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size + 3 * in_width; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; + +#if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; // load filter (2 outch x 3 height x 3 width): vf_outch_height @@ -179,7 +206,7 @@ void Conv2dNeonK3x3S1(const 
float *input, out_ptr0 += out_width; out_ptr1 += out_width; } // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 float *out_ptr0 = out_ptr0_base; // load filter (2 outch x 3 height x 3 width): vf_outch_height @@ -301,32 +328,28 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; out_ptr1 += out_width; } // h -#else - for (index_t oc = 0; oc < 2; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0 + oc * in_channels * 9, - in_width, 3, 3, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } #endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr0 = - input + b * in_batch_size + c * in_image_size; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; const float *in_ptr1 = - input + b * in_batch_size + c * in_image_size + 1 * in_width; + input_data + b * in_batch_size + c * in_image_size + + 1 * in_width; const float *in_ptr2 = - input + b * in_batch_size + c * in_image_size + 2 * in_width; + input_data + b * in_batch_size + c * in_image_size + + 2 * in_width; const float *in_ptr3 = - input + b * in_batch_size + c * in_image_size + 3 * in_width; -#endif - const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9; + input_data + b * in_batch_size + c * in_image_size + + 3 * in_width; + const float + *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; // load filter (1 outch x 3 height x 3 width): vf_outch_height @@ -409,7 +432,7 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; } // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 float *out_ptr0 = out_ptr0_base; // load filter (1 outch x 3 height x 3 width): vf_outch_height @@ -494,22 +517,52 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; } // h -#else - Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0, in_width, 3, 3, out_height, - out_width, out_ptr0_base, 1); #endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -void Conv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -523,11 +576,12 @@ void 
Conv2dNeonK3x3S2(const float *input, const index_t in_width = in_shape[3]; const index_t out_height = out_shape[2]; const index_t out_width = out_shape[3]; - const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + m * in_channels * 9 + c * 9; - float *out_base = output + b * out_batch_size + m * out_image_size; + const float + *in_base = input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; + float *out_base = output_data + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(__aarch64__) // load filter (1 outch x 3 height x 3 width): vf_outch_height float32x4_t vf00, vf01, vf02; vf00 = vld1q_f32(filter_ptr); @@ -587,7 +641,7 @@ void Conv2dNeonK3x3S2(const float *input, vst1q_f32(out_base + out_offset, vo); } // w } // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 // load filter (1 outch x 3 height x 3 width): vf_outch_height float32x2_t vf01, vf23, vf45, vf67, vf78; vf01 = vld1_f32(filter_ptr); @@ -649,14 +703,16 @@ void Conv2dNeonK3x3S2(const float *input, vst1q_f32(out_base + out_offset, vo); } // w } // h -#else - Conv2dCPUKHxKWCalc(in_base, filter_ptr, in_width, 3, 3, out_height, - out_width, out_base, 2); #endif } // c } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_3x3.h b/mace/ops/arm/fp32/conv_2d_3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..66d47801c39fee076ca0fd0bddff806a8e30c127 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_3x3.h @@ -0,0 +1,60 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
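The Compute bodies above all share one skeleton: pad the input/output if needed, clear the output (the inner loops accumulate into it), run the NEON loops, then un-pad. A condensed fragment of that flow, with helper names taken from this diff; the <Tensor> template arguments and the /*out_tile_*/ parameter names are assumptions here (the 2, 4 arguments are read as the old "Ho = 2, Wo = 4" register tiling):

  // Condensed Compute() skeleton of the rewritten fp32 conv kernels (sketch).
  std::unique_ptr<Tensor> padded_input;
  std::unique_ptr<Tensor> padded_output;
  ResizeOutAndPadInOut(context, input, filter, output,
                       /*out_tile_h=*/2, /*out_tile_w=*/4,
                       &padded_input, &padded_output);
  const Tensor *in_tensor = padded_input ? padded_input.get() : input;
  Tensor *out_tensor = padded_output ? padded_output.get() : output;
  out_tensor->Clear();               // inner loops accumulate into the output
  // ... NEON inner loops read in_tensor and accumulate into out_tensor ...
  UnPadOutput(*out_tensor, output);  // copy the valid region back if padded
  return MaceStatus::MACE_SUCCESS;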
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
+
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK3x3S1 : public Conv2dBase {
+ public:
+  Conv2dK3x3S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK3x3S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK3x3S2 : public Conv2dBase {
+ public:
+  Conv2dK3x3S2(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK3x3S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
diff --git a/mace/ops/arm/conv_winograd.cc b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc
similarity index 60%
rename from mace/ops/arm/conv_winograd.cc
rename to mace/ops/arm/fp32/conv_2d_3x3_winograd.cc
index 11d4fbf0d52eac3d8c7abab87a5f5b95693c5df5..b894a60a964ff9b149abc5d93852f76a658b9b94 100644
--- a/mace/ops/arm/conv_winograd.cc
+++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,20 +14,375 @@
 #include
-#include "mace/ops/arm/conv_winograd.h"
+#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/utils/memory.h"
+#include "mace/utils/math.h"

 namespace mace {
 namespace ops {
+namespace arm {
+namespace fp32 {
+
+MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
+                                       const Tensor *input,
+                                       const Tensor *filter,
+                                       Tensor *output) {
+  const index_t batch = input->dim(0);
+  const index_t in_channels = input->dim(1);
+  const index_t in_height = input->dim(2);
+  const index_t in_width = input->dim(3);
+  const index_t out_channels = filter->dim(0);
+
+  // When size of input feature map is bigger than 16x16,
+  // set winograd out tile size to 6 to get higher performance.
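  // (Why 6 helps on larger maps: with out tile size m, F(m x m, 3x3)
  // consumes (m + 2) x (m + 2) input tiles, so m = 6 needs 64 per-tile
  // element GEMMs instead of 16 but produces 36 outputs per tile instead
  // of 4 - fewer tiles and less arithmetic per output once the transform
  // cost is amortized.)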
+ index_t out_tile_size = 2; + if (in_height > 16 && in_width > 16) { + out_tile_size = 6; + } + + std::vector output_shape; + std::vector in_pad_size; + std::vector out_pad_size; + CalOutputShapeAndPadSize(input, + filter, + out_tile_size, + out_tile_size, + &output_shape, + &in_pad_size, + &out_pad_size); + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + + const index_t out_height = output_shape[2]; + const index_t out_width = output_shape[3]; + const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1]; + const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3]; + const index_t + padded_out_height = out_height + out_pad_size[0] + out_pad_size[1]; + const index_t + padded_out_width = out_width + out_pad_size[2] + out_pad_size[3]; + const int pad_top = in_pad_size[0]; + const int pad_left = in_pad_size[2]; + + bool is_in_padded = + padded_in_height != in_height || padded_in_width != in_width; + bool is_out_padded = + padded_out_height != out_height || padded_out_width != out_width; + + const index_t + tile_height_count = padded_out_height / out_tile_size; + const index_t tile_width_count = padded_out_width / out_tile_size; + const index_t tile_count = tile_height_count * tile_width_count; + const index_t in_tile_area = (out_tile_size + 2) * (out_tile_size + 2); + + // pad input and transform input + auto scratch_buffer = context->device()->scratch_buffer(); + const index_t padded_in_size = is_in_padded ? PadAlignSize( + sizeof(float) * batch * in_channels * padded_in_height + * padded_in_width) : 0; + const index_t padded_out_size = is_out_padded ? PadAlignSize( + sizeof(float) * batch * out_channels * padded_out_height + * padded_out_width) : 0; + const index_t transformed_in_size = PadAlignSize( + sizeof(float) * batch * in_tile_area * in_channels * tile_count); + const index_t transformed_out_size = PadAlignSize( + sizeof(float) * batch * in_tile_area * out_channels * tile_count); + const index_t transformed_filter_size = + PadAlignSize(sizeof(float) * in_tile_area * out_channels * in_channels); + const index_t gemm_pack_size = + transformed_in_size + transformed_filter_size + transformed_filter_size; + + scratch_buffer->Rewind(); + scratch_buffer->GrowSize( + padded_in_size + padded_out_size + transformed_in_size + + transformed_out_size + gemm_pack_size); + + const Tensor *padded_in = input; + Tensor tmp_padded_in + (scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT); + if (is_in_padded) { + tmp_padded_in.Resize({batch, in_channels, padded_in_height, + padded_in_width}); + Tensor::MappingGuard guard(&tmp_padded_in); + PadInput(*input, pad_top, pad_left, &tmp_padded_in); + padded_in = &tmp_padded_in; + } + + Tensor *padded_out = output; + Tensor tmp_padded_out + (scratch_buffer->Scratch(padded_out_size), DataType::DT_FLOAT); + if (is_out_padded) { + padded_out = &tmp_padded_out; + padded_out->Resize({batch, out_channels, padded_out_height, + padded_out_width}); + } + + auto transformed_in = scratch_buffer->Scratch(transformed_in_size); + auto transformed_out = scratch_buffer->Scratch(transformed_out_size); + auto padded_in_data = padded_in->data(); + auto padded_out_data = padded_out->mutable_data(); + auto transformed_in_data = transformed_in.mutable_data(); + auto transformed_out_data = transformed_out.mutable_data(); + auto filter_data = filter->data(); + + if (!filter->is_weight() || out_tile_size 
!= out_tile_size_) { + out_tile_size_ = out_tile_size; + transformed_filter_.reset(new Tensor); + transformed_filter_->Resize({in_tile_area, out_channels, in_channels}); + auto transformed_filter_data = transformed_filter_->mutable_data(); + switch (out_tile_size) { + case 2: + TransformFilter4x4(filter_data, + in_channels, + out_channels, + transformed_filter_data); + break; + case 6: + TransformFilter8x8(filter_data, + in_channels, + out_channels, + transformed_filter_data); + break; + default:MACE_NOT_IMPLEMENTED; + } + } + + switch (out_tile_size) { + case 2: + TransformInput4x4(padded_in_data, + batch, + padded_in_height, + padded_in_width, + in_channels, + tile_count, + transformed_in_data); + break; + case 6: + TransformInput8x8(padded_in_data, + batch, + padded_in_height, + padded_in_width, + in_channels, + tile_count, + transformed_in_data); + break; + default:MACE_NOT_IMPLEMENTED; + } + + const index_t scratch_buffer_offset = scratch_buffer->offset(); + const index_t transformed_in_size_per_batch = + in_tile_area * in_channels * tile_count * sizeof(float); + const index_t transformed_out_size_per_batch = + in_tile_area * out_channels * tile_count * sizeof(float); + for (index_t b = 0; b < batch; ++b) { + scratch_buffer->Rewind(scratch_buffer_offset); + + BufferSlice transformed_in_slice(&transformed_in, + b * transformed_in_size_per_batch, + transformed_in_size_per_batch); + BufferSlice transformed_out_slice(&transformed_out, + b * transformed_out_size_per_batch, + transformed_out_size_per_batch); + + Tensor transformed_in_this_batch(transformed_in_slice, DataType::DT_FLOAT); + transformed_in_this_batch.Resize({in_tile_area, in_channels, tile_count}); + Tensor + transformed_out_this_batch(transformed_out_slice, DataType::DT_FLOAT); + transformed_out_this_batch.Resize({in_tile_area, out_channels, tile_count}); + + gemm_.Compute(context, + transformed_filter_.get(), + &transformed_in_this_batch, + in_tile_area, + out_channels, + in_channels, + in_channels, + tile_count, + false, + false, + false, + true, + true, + &transformed_out_this_batch); + } + + switch (out_tile_size) { + case 2: + TransformOutput4x4(transformed_out_data, + batch, + padded_out_height, + padded_out_width, + out_channels, + tile_count, + padded_out_data); + break; + case 6: + TransformOutput8x8(transformed_out_data, + batch, + padded_out_height, + padded_out_width, + out_channels, + tile_count, + padded_out_data); + break; + default:MACE_NOT_IMPLEMENTED; + } + + UnPadOutput(*padded_out, output); + + return MaceStatus::MACE_SUCCESS; +} + +// OCHW => TOC +void Conv2dK3x3Winograd::TransformFilter4x4(const float *filter, + const index_t in_channels, + const index_t out_channels, + float *output) { + const index_t stride = out_channels * in_channels; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t m = 0; m < out_channels; ++m) { + for (index_t c = 0; c < in_channels; ++c) { + float g0, g1, g2, g3, g4, g5, g6, g7, g8; + float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + // load filter + index_t filter_offset = (m * in_channels + c) * 9; + g0 = filter[filter_offset]; + g1 = filter[filter_offset + 1]; + g2 = filter[filter_offset + 2]; + g3 = filter[filter_offset + 3]; + g4 = filter[filter_offset + 4]; + g5 = filter[filter_offset + 5]; + g6 = filter[filter_offset + 6]; + g7 = filter[filter_offset + 7]; + g8 = filter[filter_offset + 8]; + + // s = G * g * GT + s0 = g0; + s1 = (g0 + g2 + g1) * 0.5f; + s2 = (g0 + g2 - g1) * 0.5f; + s3 = g2; + s4 = (g0 + g6 + g3) * 
0.5f; + s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f; + s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f; + s7 = (g2 + g8 + g5) * 0.5f; + s8 = (g0 + g6 - g3) * 0.5f; + s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f; + s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f; + s11 = (g2 + g8 - g5) * 0.5f; + s12 = g6; + s13 = (g6 + g8 + g7) * 0.5f; + s14 = (g6 + g8 - g7) * 0.5f; + s15 = g8; + + // store output + index_t output_offset = m * in_channels + c; + output[output_offset + 0 * stride] = s0; + output[output_offset + 1 * stride] = s1; + output[output_offset + 2 * stride] = s2; + output[output_offset + 3 * stride] = s3; + + output[output_offset + 4 * stride] = s4; + output[output_offset + 5 * stride] = s5; + output[output_offset + 6 * stride] = s6; + output[output_offset + 7 * stride] = s7; + + output[output_offset + 8 * stride] = s8; + output[output_offset + 9 * stride] = s9; + output[output_offset + 10 * stride] = s10; + output[output_offset + 11 * stride] = s11; + + output[output_offset + 12 * stride] = s12; + output[output_offset + 13 * stride] = s13; + output[output_offset + 14 * stride] = s14; + output[output_offset + 15 * stride] = s15; + } + } +} + +// OCHW => TOC +/** + * G = +⎡ 1 0 0 ⎤ +⎢ ⎥ +⎢-2/9 -2/9 -2/9 ⎥ +⎢ ⎥ +⎢-2/9 2/9 -2/9 ⎥ +⎢ ⎥ +⎢1/90 1/45 2/45 ⎥ +⎢ ⎥ +⎢1/90 -1/45 2/45 ⎥ +⎢ ⎥ +⎢1/45 1/90 1/180⎥ +⎢ ⎥ +⎢1/45 -1/90 1/180⎥ +⎢ ⎥ +⎣ 0 0 1 ⎦ + */ +void Conv2dK3x3Winograd::TransformFilter8x8(const float *filter, + const index_t in_channels, + const index_t out_channels, + float *output) { + const index_t stride = out_channels * in_channels; + + const float G[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t m = 0; m < out_channels; ++m) { + for (index_t c = 0; c < in_channels; ++c) { + // load filter + index_t filter_offset = (m * in_channels + c) * 9; + float g0, g1, g2, g3, g4, g5, g6, g7, g8; + g0 = filter[filter_offset]; + g1 = filter[filter_offset + 1]; + g2 = filter[filter_offset + 2]; + g3 = filter[filter_offset + 3]; + g4 = filter[filter_offset + 4]; + g5 = filter[filter_offset + 5]; + g6 = filter[filter_offset + 6]; + g7 = filter[filter_offset + 7]; + g8 = filter[filter_offset + 8]; + + float s[3][8]; + for (int i = 0; i < 8; ++i) { + s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2]; + s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2]; + s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2]; + } + + // store output + index_t output_offset = m * in_channels + c; + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + output[output_offset + (i * 8 + j) * stride] = + G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; + } + } + } + } +} -namespace { // NCHW => NTCB (T: in tile pixels, B: tile indices) -void TransformInput4x4(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t tile_count, - float *output) { +void Conv2dK3x3Winograd::TransformInput4x4(const float *input, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t tile_count, + float *output) { const index_t stride = in_channels * tile_count; const index_t in_height_width = in_height 
* in_width; const index_t input_batch_size = in_height_width * in_channels; @@ -46,7 +401,7 @@ void TransformInput4x4(const float *input, // load tile data const float *input_ptr = input + n * input_batch_size + - c * in_height_width + h * in_width + w; + c * in_height_width + h * in_width + w; d0 = input_ptr[0]; d1 = input_ptr[1]; d2 = input_ptr[2]; @@ -133,22 +488,14 @@ void TransformInput4x4(const float *input, ⎢0 -2 4 5/2 -5 -1/2 1 0⎥ ⎢ ⎥ ⎣0 -1 0 21/4 0 -21/4 0 1⎦ - - * @param input - * @param batch - * @param in_height - * @param in_width - * @param in_channels - * @param tile_count - * @param output */ -void TransformInput8x8(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t tile_count, - float *output) { +void Conv2dK3x3Winograd::TransformInput8x8(const float *input, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t tile_count, + float *output) { const index_t stride = in_channels * tile_count; const index_t in_height_width = in_height * in_width; const index_t input_batch_size = in_height_width * in_channels; @@ -162,7 +509,7 @@ void TransformInput8x8(const float *input, for (index_t h = 0; h < in_height - 2; h += 6) { for (index_t w = 0; w < in_width - 2; w += 6) { const float *input_ptr = input + n * input_batch_size + - c * in_height_width + h * in_width + w; + c * in_height_width + h * in_width + w; for (int i = 0; i < 8; ++i) { float d0, d1, d2, d3, d4, d5, d6, d7; @@ -235,57 +582,14 @@ void TransformInput8x8(const float *input, } } -// TOC * NTCB => NTOB -void BatchGemm(const float *input, - const float *filter, - index_t batch, - index_t in_channels, - index_t out_channels, - index_t tile_count, - int out_tile_size, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer) { - const int in_tile_area = (out_tile_size + 2) * (out_tile_size + 2); - const index_t in_batch_size = in_tile_area * in_channels * tile_count; - const index_t out_batch_size = in_tile_area * out_channels * tile_count; - - index_t scratch_buffer_offset = 0; - if (scratch_buffer) { - scratch_buffer_offset = scratch_buffer->offset(); - } - // 'batch' is not gemm batch, 'in_tile_area' is. gemm is not thread safe, - // so we loop batch using single thread. - // Scratch buffer should be rewind to the initial position to use same - // scratch memory for each batch. 
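The new Compute body keeps exactly this per-batch scratch discipline (see the scratch_buffer->Rewind inside its batch loop above); condensed, with the gemm call elided:

  // Per-batch scratch reuse: capture the offset once, rewind before each
  // batch so every iteration carves its slices from the same memory.
  const index_t scratch_buffer_offset = scratch_buffer->offset();
  for (index_t b = 0; b < batch; ++b) {
    scratch_buffer->Rewind(scratch_buffer_offset);
    // ... build per-batch transformed in/out views, then gemm_.Compute(...)
  }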
- for (int b = 0; b < batch; ++b) { - if (scratch_buffer) { - scratch_buffer->Rewind(scratch_buffer_offset); - } - sgemm->Run(filter, - input + b * in_batch_size, - in_tile_area, - out_channels, - in_channels, - in_channels, - tile_count, - false, - false, - true, - false, - output + b * out_batch_size, - scratch_buffer); - } -} - // NTOB => NToOB => NOHoWo -void TransformOutput4x4(const float *input, - index_t batch, - index_t out_height, - index_t out_width, - index_t out_channels, - index_t tile_count, - float *output) { +void Conv2dK3x3Winograd::TransformOutput4x4(const float *input, + index_t batch, + index_t out_height, + index_t out_width, + index_t out_channels, + index_t tile_count, + float *output) { const index_t stride = out_channels * tile_count; const index_t input_batch_size = 16 * stride; const index_t out_image_size = out_height * out_width; @@ -339,7 +643,7 @@ void TransformOutput4x4(const float *input, v3 = s3 - s5 - s7; float *output_ptr = output + n * output_batch_size + - m * out_image_size + h * out_width + w; + m * out_image_size + h * out_width + w; output_ptr[0] = v0; output_ptr[1] = v1; output_ptr[out_width] = v2; @@ -366,22 +670,14 @@ void TransformOutput4x4(const float *input, ⎢0 1 1 16 16 2 2 0⎥ ⎢ ⎥ ⎣0 1 -1 32 -32 1 -1 1⎦ - * - * @param input - * @param batch - * @param out_height - * @param out_width - * @param out_channels - * @param tile_count - * @param output */ -void TransformOutput8x8(const float *input, - index_t batch, - index_t out_height, - index_t out_width, - index_t out_channels, - index_t tile_count, - float *output) { +void Conv2dK3x3Winograd::TransformOutput8x8(const float *input, + index_t batch, + index_t out_height, + index_t out_width, + index_t out_channels, + index_t tile_count, + float *output) { const index_t stride = out_channels * tile_count; const index_t input_batch_size = 64 * stride; const index_t out_image_size = out_height * out_width; @@ -426,7 +722,7 @@ void TransformOutput8x8(const float *input, } float *output_ptr = output + n * output_batch_size + - m * out_image_size + h * out_width + w; + m * out_image_size + h * out_width + w; for (int i = 0; i < 6; ++i) { float d0, d1, d2, d3, d4, d5, d6, d7; @@ -460,291 +756,8 @@ void TransformOutput8x8(const float *input, } } } -} // namespace - -// OCHW => TOC -// no need to optimize, it will exist in converter -void TransformFilter4x4(const float *filter, - const index_t in_channels, - const index_t out_channels, - float *output) { - const index_t stride = out_channels * in_channels; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { - float g0, g1, g2, g3, g4, g5, g6, g7, g8; - float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - - // load filter - index_t filter_offset = (m * in_channels + c) * 9; - g0 = filter[filter_offset]; - g1 = filter[filter_offset + 1]; - g2 = filter[filter_offset + 2]; - g3 = filter[filter_offset + 3]; - g4 = filter[filter_offset + 4]; - g5 = filter[filter_offset + 5]; - g6 = filter[filter_offset + 6]; - g7 = filter[filter_offset + 7]; - g8 = filter[filter_offset + 8]; - - // s = G * g * GT - s0 = g0; - s1 = (g0 + g2 + g1) * 0.5f; - s2 = (g0 + g2 - g1) * 0.5f; - s3 = g2; - s4 = (g0 + g6 + g3) * 0.5f; - s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f; - s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f; - s7 = (g2 + g8 + g5) * 0.5f; - s8 = (g0 + g6 - g3) * 0.5f; - s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) 
+ (g1 + g7 - g4)) * 0.25f; - s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f; - s11 = (g2 + g8 - g5) * 0.5f; - s12 = g6; - s13 = (g6 + g8 + g7) * 0.5f; - s14 = (g6 + g8 - g7) * 0.5f; - s15 = g8; - - // store output - index_t output_offset = m * in_channels + c; - output[output_offset + 0 * stride] = s0; - output[output_offset + 1 * stride] = s1; - output[output_offset + 2 * stride] = s2; - output[output_offset + 3 * stride] = s3; - - output[output_offset + 4 * stride] = s4; - output[output_offset + 5 * stride] = s5; - output[output_offset + 6 * stride] = s6; - output[output_offset + 7 * stride] = s7; - - output[output_offset + 8 * stride] = s8; - output[output_offset + 9 * stride] = s9; - output[output_offset + 10 * stride] = s10; - output[output_offset + 11 * stride] = s11; - - output[output_offset + 12 * stride] = s12; - output[output_offset + 13 * stride] = s13; - output[output_offset + 14 * stride] = s14; - output[output_offset + 15 * stride] = s15; - } - } -} - -// OCHW => TOC -// no need to optimize, it will exist in converter -/** - * G = -⎡ 1 0 0 ⎤ -⎢ ⎥ -⎢-2/9 -2/9 -2/9 ⎥ -⎢ ⎥ -⎢-2/9 2/9 -2/9 ⎥ -⎢ ⎥ -⎢1/90 1/45 2/45 ⎥ -⎢ ⎥ -⎢1/90 -1/45 2/45 ⎥ -⎢ ⎥ -⎢1/45 1/90 1/180⎥ -⎢ ⎥ -⎢1/45 -1/90 1/180⎥ -⎢ ⎥ -⎣ 0 0 1 ⎦ - * - * @param filter - * @param in_channels - * @param out_channels - * @param output - */ -void TransformFilter8x8(const float *filter, - const index_t in_channels, - const index_t out_channels, - float *output) { - const index_t stride = out_channels * in_channels; - - const float G[8][3] = {{1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f}}; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { - // load filter - index_t filter_offset = (m * in_channels + c) * 9; - float g0, g1, g2, g3, g4, g5, g6, g7, g8; - g0 = filter[filter_offset]; - g1 = filter[filter_offset + 1]; - g2 = filter[filter_offset + 2]; - g3 = filter[filter_offset + 3]; - g4 = filter[filter_offset + 4]; - g5 = filter[filter_offset + 5]; - g6 = filter[filter_offset + 6]; - g7 = filter[filter_offset + 7]; - g8 = filter[filter_offset + 8]; - - float s[3][8]; - for (int i = 0; i < 8; ++i) { - s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2]; - s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2]; - s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2]; - } - - // store output - index_t output_offset = m * in_channels + c; - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { - output[output_offset + (i * 8 + j) * stride] = - G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; - } - } - } - } -} - -void WinoGradConv3x3s1(const float *input, - const float *transformed_filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - const int out_tile_size, - float *transformed_input, - float *transformed_output, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer) { - index_t out_height = in_height - 2; - index_t out_width = in_width - 2; - index_t tile_height_count = - RoundUpDiv(out_height, static_cast(out_tile_size)); - index_t tile_width_count = - RoundUpDiv(out_width, static_cast(out_tile_size)); - index_t tile_count = tile_height_count * tile_width_count; - - switch 
(out_tile_size) { - case 2: - TransformInput4x4(input, batch, in_height, in_width, in_channels, - tile_count, transformed_input); - break; - case 6: - TransformInput8x8(input, batch, in_height, in_width, in_channels, - tile_count, transformed_input); - break; - default: - MACE_NOT_IMPLEMENTED; - } - - BatchGemm(transformed_input, transformed_filter, batch, in_channels, - out_channels, tile_count, out_tile_size, transformed_output, - sgemm, scratch_buffer); - - switch (out_tile_size) { - case 2: - TransformOutput4x4(transformed_output, batch, out_height, out_width, - out_channels, tile_count, output); - break; - case 6: - TransformOutput8x8(transformed_output, batch, out_height, out_width, - out_channels, tile_count, output); - break; - default: - MACE_NOT_IMPLEMENTED; - } -} - -void WinoGradConv3x3s1(const float *input, - const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - const int out_tile_size, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer) { - index_t out_height = in_height - 2; - index_t out_width = in_width - 2; - index_t tile_height_count = - RoundUpDiv(out_height, static_cast(out_tile_size)); - index_t tile_width_count = - RoundUpDiv(out_width, static_cast(out_tile_size)); - index_t tile_count = tile_height_count * tile_width_count; - index_t in_tile_area = (out_tile_size + 2) * (out_tile_size + 2); - index_t transformed_input_size = - in_tile_area * batch * in_channels * tile_count; - index_t transformed_filter_size = in_tile_area * out_channels * in_channels; - index_t transformed_output_size = - in_tile_area * batch * out_channels * tile_count; - - float *transformed_input = new float[transformed_input_size]; // TNCB - float *transformed_filter = new float[transformed_filter_size]; // TOC - float *transformed_output = new float[transformed_output_size]; - - switch (out_tile_size) { - case 2: - TransformFilter4x4(filter, in_channels, out_channels, transformed_filter); - break; - case 6: - TransformFilter8x8(filter, in_channels, out_channels, transformed_filter); - break; - default: - MACE_NOT_IMPLEMENTED; - } - - WinoGradConv3x3s1(input, transformed_filter, batch, in_height, in_width, - in_channels, out_channels, out_tile_size, transformed_input, - transformed_output, output, sgemm, scratch_buffer); - - delete[] transformed_input; - delete[] transformed_filter; - delete[] transformed_output; -} - -void ConvRef3x3s1(const float *input, - const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - float *output) { - index_t out_height = in_height - 2; - index_t out_width = in_width - 2; - -#pragma omp parallel for collapse(4) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - index_t out_offset = - ((b * out_channels + m) * out_height + h) * out_width + w; - output[out_offset] = 0; - for (index_t c = 0; c < in_channels; ++c) { - for (index_t kh = 0; kh < 3; ++kh) { - for (index_t kw = 0; kw < 3; ++kw) { - index_t ih = h + kh; - index_t iw = w + kw; - index_t in_offset = - ((b * in_channels + c) * in_height + ih) * in_width + iw; - index_t filter_offset = - (((m * in_channels) + c) * 3 + kh) * 3 + kw; - output[out_offset] += input[in_offset] * filter[filter_offset]; - } - } - } - } - } - } - } -} +} // namespace fp32 +} 
// namespace arm
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ed8646b17c12424a884611ac22698c6d3a9bf05
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h
@@ -0,0 +1,102 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK3x3Winograd : public Conv2dBase {
+ public:
+  Conv2dK3x3Winograd(const std::vector<int> paddings,
+                     const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type),
+        gemm_(),
+        transformed_filter_(nullptr),
+        out_tile_size_(0) {}
+
+  virtual ~Conv2dK3x3Winograd() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+
+ private:
+  void TransformFilter4x4(const float *filter,
+                          const index_t in_channels,
+                          const index_t out_channels,
+                          float *output);
+
+  void TransformFilter8x8(const float *filter,
+                          const index_t in_channels,
+                          const index_t out_channels,
+                          float *output);
+
+  void TransformInput4x4(const float *input,
+                         const index_t batch,
+                         const index_t in_height,
+                         const index_t in_width,
+                         const index_t in_channels,
+                         const index_t tile_count,
+                         float *output);
+
+  void TransformInput8x8(const float *input,
+                         const index_t batch,
+                         const index_t in_height,
+                         const index_t in_width,
+                         const index_t in_channels,
+                         const index_t tile_count,
+                         float *output);
+
+  void TransformOutput4x4(const float *input,
+                          index_t batch,
+                          index_t out_height,
+                          index_t out_width,
+                          index_t out_channels,
+                          index_t tile_count,
+                          float *output);
+
+  void TransformOutput8x8(const float *input,
+                          index_t batch,
+                          index_t out_height,
+                          index_t out_width,
+                          index_t out_channels,
+                          index_t tile_count,
+                          float *output);
+
+  Gemm gemm_;
+  std::unique_ptr<Tensor> transformed_filter_;
+  index_t out_tile_size_;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
diff --git a/mace/ops/arm/conv_2d_neon_5x5.cc b/mace/ops/arm/fp32/conv_2d_5x5.cc
similarity index 77%
rename from mace/ops/arm/conv_2d_neon_5x5.cc
rename to mace/ops/arm/fp32/conv_2d_5x5.cc
index 81d892975ae1c431708d986f5ff7f0666a399e9a..264e48fa13f91756c47fae6f5b9db9ed7f2cc57c 100644
--- a/mace/ops/arm/conv_2d_neon_5x5.cc
+++ b/mace/ops/arm/fp32/conv_2d_5x5.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_5x5.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { #define MACE_Conv2dNeonK5x5SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -76,12 +76,40 @@ namespace ops { vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK5x5S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -96,26 +124,26 @@ void Conv2dNeonK5x5S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 25 + c * 25; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 25 + c * 25; + filter_data + (m + 1) * in_channels * 25 + c * 25; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 25 + c * 25; + filter_data + (m + 2) * in_channels * 25 + c * 25; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 25 + c * 25; + filter_data + (m + 3) * in_channels * 25 + c * 25; for (index_t h = 0; h < out_height; ++h) { 
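          // (The w loop below keeps the removed "Ho = 1, Wo = 4, Co = 4"
          // tiling: each pass computes 4 adjacent output pixels for 4
          // output channels at once.)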
for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -158,23 +186,16 @@ void Conv2dNeonK5x5S1(const float *input, filter_ptr3 -= 25; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 25, - in_width, 5, 5, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 25 + c * 25; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -204,16 +225,17 @@ void Conv2dNeonK5x5S1(const float *input, filter_ptr0 -= 25; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 5, 5, - out_height, out_width, out_ptr0_base, 1); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_5x5.h b/mace/ops/arm/fp32/conv_2d_5x5.h new file mode 100644 index 0000000000000000000000000000000000000000..154d74a849f38c5b114f70d897946a220a722d2c --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_5x5.h @@ -0,0 +1,48 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_FP32_CONV_2D_5X5_H_ +#define MACE_OPS_ARM_FP32_CONV_2D_5X5_H_ + +#include +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/arm/fp32/conv_2d.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class Conv2dK5x5S1 : public Conv2dBase { + public: + Conv2dK5x5S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK5x5S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_CONV_2D_5X5_H_ diff --git a/mace/ops/arm/conv_2d_neon_7x7.cc b/mace/ops/arm/fp32/conv_2d_7x7.cc similarity index 78% rename from mace/ops/arm/conv_2d_neon_7x7.cc rename to mace/ops/arm/fp32/conv_2d_7x7.cc index 2411aad6761835970ad77e8cf980bd27f045d1e8..86d3e468f494bb42e3f5c3ecaf608adca72cea5a 100644 --- a/mace/ops/arm/conv_2d_neon_7x7.cc +++ b/mace/ops/arm/fp32/conv_2d_7x7.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. 
All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_7x7.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -153,12 +153,40 @@ namespace ops { vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \ vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -173,26 +201,25 @@ void Conv2dNeonK7x7S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c 
* 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -243,23 +270,16 @@ void Conv2dNeonK7x7S1(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -297,23 +317,50 @@ void Conv2dNeonK7x7S1(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 1); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -328,26 +375,25 @@ void Conv2dNeonK7x7S2(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * 
in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -403,23 +449,16 @@ void Conv2dNeonK7x7S2(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 2); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -462,23 +501,50 @@ void Conv2dNeonK7x7S2(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 2); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S3(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -493,26 +559,25 @@ void Conv2dNeonK7x7S3(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float 
*out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -568,23 +633,16 @@ void Conv2dNeonK7x7S3(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 3); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -627,16 +685,17 @@ void Conv2dNeonK7x7S3(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 3); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_7x7.h b/mace/ops/arm/fp32/conv_2d_7x7.h new file mode 100644 index 0000000000000000000000000000000000000000..e64780bab2bb4c22c2107da29d85b9040ef86460 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_7x7.h @@ -0,0 +1,73 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
+
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK7x7S1 : public Conv2dBase {
+ public:
+  Conv2dK7x7S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK7x7S2 : public Conv2dBase {
+ public:
+  Conv2dK7x7S2(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK7x7S3 : public Conv2dBase {
+ public:
+  Conv2dK7x7S3(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({3, 3}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S3() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
diff --git a/mace/ops/arm/fp32/conv_general.cc b/mace/ops/arm/fp32/conv_general.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a12c5d53b83c275a470f04accdeee07d65317330
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_general.cc
@@ -0,0 +1,232 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
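conv_general.cc below is the new fallback for arbitrary filter sizes, strides, and dilations; it register-blocks 4 adjacent output pixels (and, where possible, 4 output channels) in plain scalar C rather than NEON intrinsics. Condensed to one output channel, with filter_h/filter_w standing in for filter_shape[2]/filter_shape[3]:

  // One output channel of the Conv2dGeneral inner loop (sketch of the body
  // that follows): vo0 accumulates 4 horizontally adjacent output pixels.
  float vo0[4];  // loaded from out_ptr0_base[out_offset + 0..3]
  for (index_t kh = 0; kh < filter_h; ++kh) {
    for (index_t kw = 0; kw < filter_w; ++kw) {
      const float f = filter_ptr0[kw];
      vo0[0] += in_ptr_base[in_offset + kw * dilation_w] * f;
      vo0[1] += in_ptr_base[in_offset + stride_w + kw * dilation_w] * f;
      vo0[2] += in_ptr_base[in_offset + 2 * stride_w + kw * dilation_w] * f;
      vo0[3] += in_ptr_base[in_offset + 3 * stride_w + kw * dilation_w] * f;
    }
    in_offset += dilation_h * in_width;  // step one (dilated) filter row down
    filter_ptr0 += filter_w;             // next filter row
  }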
+ +#include +#include "mace/ops/arm/fp32/conv_general.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Conv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + auto filter_shape = filter->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = filter_shape[1] * in_image_size; + const index_t out_batch_size = filter_shape[0] * out_image_size; + const index_t filter_size = filter_shape[2] * filter_shape[3]; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < in_shape[0]; b++) { + for (index_t m = 0; m < filter_shape[0]; m += 4) { + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + + const int stride_h = strides_[0]; + const int stride_w = strides_[1]; + const int dilation_h = dilations_[0]; + const int dilation_w = dilations_[1]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = + filter_data + m * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (4 outch x 1 height x 4 width): vo_outch_height + float vo0[4], vo1[4], vo2[4], vo3[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + vo1[ow] = out_ptr1_base[out_offset + ow]; + vo2[ow] = out_ptr2_base[out_offset + ow]; + vo3[ow] = out_ptr3_base[out_offset + ow]; + } + // calc by row + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * 
+                  // outch 1
+                  vo1[0] += in_ptr_base[in_offset
+                      + kw * dilation_w] * filter_ptr1[kw];
+                  vo1[1] += in_ptr_base[in_offset + stride_w
+                      + kw * dilation_w] * filter_ptr1[kw];
+                  vo1[2] += in_ptr_base[in_offset + 2 * stride_w
+                      + kw * dilation_w] * filter_ptr1[kw];
+                  vo1[3] += in_ptr_base[in_offset + 3 * stride_w
+                      + kw * dilation_w] * filter_ptr1[kw];
+                  // outch 2
+                  vo2[0] += in_ptr_base[in_offset
+                      + kw * dilation_w] * filter_ptr2[kw];
+                  vo2[1] += in_ptr_base[in_offset + stride_w
+                      + kw * dilation_w] * filter_ptr2[kw];
+                  vo2[2] += in_ptr_base[in_offset + 2 * stride_w
+                      + kw * dilation_w] * filter_ptr2[kw];
+                  vo2[3] += in_ptr_base[in_offset + 3 * stride_w
+                      + kw * dilation_w] * filter_ptr2[kw];
+                  // outch 3
+                  vo3[0] += in_ptr_base[in_offset
+                      + kw * dilation_w] * filter_ptr3[kw];
+                  vo3[1] += in_ptr_base[in_offset + stride_w
+                      + kw * dilation_w] * filter_ptr3[kw];
+                  vo3[2] += in_ptr_base[in_offset + 2 * stride_w
+                      + kw * dilation_w] * filter_ptr3[kw];
+                  vo3[3] += in_ptr_base[in_offset + 3 * stride_w
+                      + kw * dilation_w] * filter_ptr3[kw];
+                }  // kw
+
+                in_offset += dilation_h * in_width;
+                filter_ptr0 += filter_shape[3];
+                filter_ptr1 += filter_shape[3];
+                filter_ptr2 += filter_shape[3];
+                filter_ptr3 += filter_shape[3];
+              }  // kh
+
+              for (index_t ow = 0; ow < 4; ++ow) {
+                out_ptr0_base[out_offset + ow] = vo0[ow];
+                out_ptr1_base[out_offset + ow] = vo1[ow];
+                out_ptr2_base[out_offset + ow] = vo2[ow];
+                out_ptr3_base[out_offset + ow] = vo3[ow];
+              }
+
+              filter_ptr0 -= filter_size;
+              filter_ptr1 -= filter_size;
+              filter_ptr2 -= filter_size;
+              filter_ptr3 -= filter_size;
+            }  // w
+          }  // h
+        }  // c
+      } else {
+        for (index_t mm = m; mm < out_channels; ++mm) {
+          float *out_ptr0_base =
+              output_data + b * out_batch_size + mm * out_image_size;
+          for (index_t c = 0; c < in_channels; ++c) {
+            const float *in_ptr_base =
+                input_data + b * in_batch_size + c * in_image_size;
+            const float *filter_ptr0 =
+                filter_data + mm * in_channels * filter_size + c * filter_size;
+
+            for (index_t h = 0; h < out_height; ++h) {
+              for (index_t w = 0; w + 3 < out_width; w += 4) {
+                // input offset
+                index_t ih = h * stride_h;
+                index_t iw = w * stride_w;
+                index_t in_offset = ih * in_width + iw;
+                // output (1 outch x 1 height x 4 width): vo_outch_height
+                float vo0[4];
+                // load output
+                index_t out_offset = h * out_width + w;
+                for (index_t ow = 0; ow < 4; ++ow) {
+                  vo0[ow] = out_ptr0_base[out_offset + ow];
+                }
+
+                // calc by row
+                for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
+                  for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
+                    // outch 0
+                    vo0[0] += in_ptr_base[in_offset
+                        + kw * dilation_w] * filter_ptr0[kw];
+                    vo0[1] += in_ptr_base[in_offset + stride_w
+                        + kw * dilation_w] * filter_ptr0[kw];
+                    vo0[2] += in_ptr_base[in_offset + 2 * stride_w
+                        + kw * dilation_w] * filter_ptr0[kw];
+                    vo0[3] += in_ptr_base[in_offset + 3 * stride_w
+                        + kw * dilation_w] * filter_ptr0[kw];
+                  }  // kw
+
+                  in_offset += dilation_h * in_width;
+                  filter_ptr0 += filter_shape[3];
+                }  // kh
+
+                for (index_t ow = 0; ow < 4; ++ow) {
+                  out_ptr0_base[out_offset + ow] = vo0[ow];
+                }
+                filter_ptr0 -= filter_size;
+              }  // w
+            }  // h
+          }  // c
+        }  // mm
+      }  // if
+    }  // m
+  }  // b
+
+  UnPadOutput(*out_tensor, output);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/fp32/conv_general.h b/mace/ops/arm/fp32/conv_general.h
new file mode 100644
index 0000000000000000000000000000000000000000..01d019548a19fee9c79deb6d918dac9431110fac
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_general.h
@@ -0,0 +1,50 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_GENERAL_H_
+#define MACE_OPS_ARM_FP32_CONV_GENERAL_H_
+
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dGeneral : public Conv2dBase {
+ public:
+  Conv2dGeneral(const std::vector<int> strides,
+                const std::vector<int> dilations,
+                const std::vector<int> paddings,
+                const Padding padding_type)
+      : Conv2dBase(strides, dilations, paddings, padding_type) {}
+  virtual ~Conv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_GENERAL_H_
diff --git a/mace/ops/arm/fp32/gemm.h b/mace/ops/arm/fp32/gemm.h
index f4cfc42bb199161a877ab0329670004ef94a6b97..ce226c1a341d76d7f873cb527408688c2e538a8c 100644
--- a/mace/ops/arm/fp32/gemm.h
+++ b/mace/ops/arm/fp32/gemm.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
 #include "mace/ops/common/matrix.h"
+#include "mace/utils/math.h"
 
 // This implements matrix-matrix multiplication.
 // In the case of matrix-vector multiplication, use gemv.h/gemv.cc instead
diff --git a/mace/ops/arm/fp32/gemv.cc b/mace/ops/arm/fp32/gemv.cc
index 703e39449663a66d8076d7b2500a9820c209938c..cd0f607fd63f16bb5c99ea0a369dc8423a6bf358 100644
--- a/mace/ops/arm/fp32/gemv.cc
+++ b/mace/ops/arm/fp32/gemv.cc
@@ -18,6 +18,8 @@
 #include <arm_neon.h>
 #include <algorithm>
 
+#include "mace/utils/math.h"
+
 #if !defined(__aarch64__)
 float vaddvq_f32(float32x4_t v) {
   float32x2_t _sum = vadd_f32(vget_low_f32(v), vget_high_f32(v));
@@ -258,11 +260,12 @@ MaceStatus Gemv::Compute(const OpContext *context,
         ++rhs_ptr;
       }
 
-      float32x4_t vbias = vdupq_n_f32(0);
       if (bias) {
+        float32x4_t vbias = vdupq_n_f32(0);
         vbias = vld1q_f32(bias_data + h_start);
+        vo = vaddq_f32(vo, vbias);
       }
-      vo = vaddq_f32(vo, vbias);
+
       vst1q_f32(ret_ptr, vo);
     } else {  // h_block_len < 4
 #endif  // MACE_GEMV_UNROLL
diff --git a/mace/ops/arm/fp32/gemv.h b/mace/ops/arm/fp32/gemv.h
index 3210def1dd50ecc5e4c45dbda0d4da67df55ee8e..1f406426fbe93ae965f23450eca2a5ba1c517db1 100644
--- a/mace/ops/arm/fp32/gemv.h
+++ b/mace/ops/arm/fp32/gemv.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/q8/eltwise.cc b/mace/ops/arm/q8/eltwise.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f987da81373282f769f660e5f10e7795413b3be4
--- /dev/null
+++ b/mace/ops/arm/q8/eltwise.cc
@@ -0,0 +1,157 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/q8/eltwise.h"
+
+#include <arm_neon.h>
+#include <algorithm>
+
+#include "mace/ops/common/gemmlowp_util.h"
+#include "mace/utils/logging.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace q8 {
+
+MaceStatus Eltwise::Compute(const OpContext *context,
+                            const Tensor *input0,
+                            const Tensor *input1,
+                            Tensor *output) {
+  MACE_UNUSED(context);
+  MACE_CHECK(type_ == SUM || type_ == SUB,
+             "Quantized Elementwise only support SUM and SUB now.");
+
+  constexpr int left_shift = 20;
+  const double doubled_scale = 2 * std::max(input0->scale(), input1->scale());
+  const double adjusted_input0_scale = input0->scale() / doubled_scale;
+  const double adjusted_input1_scale = input1->scale() / doubled_scale;
+  const double adjusted_output_scale =
+      doubled_scale / ((1 << left_shift) * output->scale());
+
+  int32_t input0_multiplier;
+  int32_t input1_multiplier;
+  int32_t output_multiplier;
+  int32_t input0_shift;
+  int32_t input1_shift;
+  int32_t output_shift;
+  QuantizeMultiplier(adjusted_input0_scale,
+                     &input0_multiplier,
+                     &input0_shift);
+  QuantizeMultiplier(adjusted_input1_scale,
+                     &input1_multiplier,
+                     &input1_shift);
+  QuantizeMultiplier(adjusted_output_scale,
+                     &output_multiplier,
+                     &output_shift);
+
+  Tensor::MappingGuard input0_guard(input0);
+  Tensor::MappingGuard input1_guard(input1);
+  Tensor::MappingGuard output_guard(output);
+
+  auto input0_ptr = input0->data<uint8_t>();
+  auto input1_ptr = input1->data<uint8_t>();
+  auto output_ptr = output->mutable_data<uint8_t>();
+
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = 0; i <= output->size() - 8; i += 8) {
+    const auto input0_val = vld1_u8(input0_ptr + i);
+    const auto input1_val = vld1_u8(input1_ptr + i);
+    const auto input0_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input0_val));
+    const auto input1_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input1_val));
+    const auto offset_input0 =
+        vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
+    const auto offset_input1 =
+        vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
+    auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
+    auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
+    auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
+    auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
+    const auto left_shift_dup = vdupq_n_s32(left_shift);
+    input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
+    input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
+    input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
+
+    input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
+    input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
+    input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
+    input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
+    input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
+    const auto input0_shift_dup = vdupq_n_s32(input0_shift);
+    const auto input1_shift_dup = vdupq_n_s32(input1_shift);
+    input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
+    input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
+    input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
+    input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
+    int32x4_t res_low, res_high;
+    if (type_ == SUM) {
+      res_low = vaddq_s32(input0_low_s32, input1_low_s32);
+      res_high = vaddq_s32(input0_high_s32, input1_high_s32);
+    } else {
+      res_low = vsubq_s32(input0_low_s32, input1_low_s32);
+      res_high = vsubq_s32(input0_high_s32, input1_high_s32);
+    }
+    res_low = vqrdmulhq_n_s32(res_low, output_multiplier);
+    res_high = vqrdmulhq_n_s32(res_high, output_multiplier);
+    res_low = gemmlowp::RoundingDivideByPOT(res_low, -output_shift);
+    res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift);
+    const auto res_low_s16 = vmovn_s32(res_low);
+    const auto res_high_s16 = vmovn_s32(res_high);
+    const auto output_val = vaddq_s16(vcombine_s16(res_low_s16,
+                                                   res_high_s16),
+                                      vdupq_n_s16(output->zero_point()));
+    vst1_u8(output_ptr + i, vqmovun_s16(output_val));
+  }
+
+  index_t handled_output_size = output->size() - output->size() % 8;
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = handled_output_size; i < output->size(); ++i) {
+    const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
+    const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
+    const int32_t shifted_input0 = offset_input0 * (1 << left_shift);
+    const int32_t shifted_input1 = offset_input1 * (1 << left_shift);
+    const int32_t multiplied_input0 =
+        gemmlowp::RoundingDivideByPOT(
+            gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0,
+                                                        input0_multiplier),
+            -input0_shift);
+    const int32_t multiplied_input1 =
+        gemmlowp::RoundingDivideByPOT(
+            gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1,
+                                                        input1_multiplier),
+            -input1_shift);
+
+    int32_t res;
+    if (type_ == SUM) {
+      res = multiplied_input0 + multiplied_input1;
+    } else {
+      res = multiplied_input0 - multiplied_input1;
+    }
+
+    const int32_t output_val =
+        gemmlowp::RoundingDivideByPOT(
+            gemmlowp::SaturatingRoundingDoublingHighMul(res,
+                                                        output_multiplier),
+            -output_shift) + output->zero_point();
+    output_ptr[i] = Saturate<uint8_t>(output_val);
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace q8
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/q8/eltwise.h b/mace/ops/arm/q8/eltwise.h
new file mode 100644
index 0000000000000000000000000000000000000000..200b13cb2769787a92c2d03da40f1b2e10d65900
--- /dev/null
+++ b/mace/ops/arm/q8/eltwise.h
@@ -0,0 +1,48 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This implements quantized (uint8) element-wise computation;
+// currently only SUM and SUB are supported.
+
+#ifndef MACE_OPS_ARM_Q8_ELTWISE_H_
+#define MACE_OPS_ARM_Q8_ELTWISE_H_
+
+#include "mace/core/op_context.h"
+#include "mace/core/types.h"
+#include "mace/ops/common/eltwise_type.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace q8 {
+
+class Eltwise {
+ public:
+  explicit Eltwise(const EltwiseType type) : type_(type) {}
+
+  MaceStatus Compute(const OpContext *context,
+                     const Tensor *input0,
+                     const Tensor *input1,
+                     Tensor *output);
+
+ private:
+  EltwiseType type_;
+};
+
+}  // namespace q8
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_Q8_ELTWISE_H_
diff --git a/mace/ops/arm/q8/gemv.cc b/mace/ops/arm/q8/gemv.cc
index 790a1448a138074105cc5d710e7c327fb5bf1f14..ce102e7e3171ff3344b4535576c9187866305fcd 100644
--- a/mace/ops/arm/q8/gemv.cc
+++ b/mace/ops/arm/q8/gemv.cc
@@ -18,14 +18,12 @@
 #include <arm_neon.h>
 #include <algorithm>
 
-#include "mace/utils/utils.h"
+#include "mace/utils/math.h"
 #include "mace/utils/quantize.h"
 
 #if !defined(__aarch64__)
-#define vmlal_high_s16(c, a, b) vmlal_s16(c, vget_high_s16(a), vget_high_s16(b))
-
-#define vaddvq_s32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
+#define vaddvq_u32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
 
 #endif
 
@@ -47,17 +45,19 @@ MaceStatus Gemv<OUTPUT_TYPE>::Compute(const OpContext *context,
                                Tensor *output) {
   MACE_UNUSED(context);
 
-  bool is_output_type_uint8 =
-      DataTypeToEnum<OUTPUT_TYPE>::value == DataType::DT_UINT8;
   Tensor::MappingGuard lhs_guard(lhs);
   Tensor::MappingGuard rhs_guard(rhs);
   Tensor::MappingGuard bias_guard(bias);
   Tensor::MappingGuard output_guard(output);
 
+  const auto *lhs_data = lhs->data<uint8_t>();
+  const auto *rhs_data = rhs->data<uint8_t>();
+  OUTPUT_TYPE *output_data = output->mutable_data<OUTPUT_TYPE>();
+
   float output_multiplier_float = 0.0;
   int32_t output_multiplier = 0;
   int32_t output_shift = 0;
-  if (is_output_type_uint8) {
+  if (is_output_type_uint8_) {
     MACE_CHECK(output->scale() > 0, "output scale must not be zero");
     output_multiplier_float = lhs->scale() * rhs->scale() / output->scale();
     GetOutputMultiplierAndShift(lhs->scale(),
@@ -66,393 +66,110 @@ MaceStatus Gemv<OUTPUT_TYPE>::Compute(const OpContext *context,
                                 &output_multiplier,
                                 &output_shift);
   }
-  const index_t h_block_size = 4;
-  const index_t h_block_count = RoundUpDiv(lhs_height, h_block_size);
 
-#pragma omp parallel for collapse(2) schedule(runtime)
+  const int32_t lhs_zero_point = lhs->zero_point();
+  const int32_t rhs_zero_point = rhs->zero_point();
+
+  const index_t w_block_size = 16;
+  const index_t w_block_count = lhs_width / w_block_size;
+  const index_t w_block_remain = lhs_width - w_block_size * w_block_count;
+
   for (index_t b = 0; b < batch; ++b) {
-    for (index_t h_block_idx = 0; h_block_idx < h_block_count; ++h_block_idx) {
-      // TODO(liyin): it can be put it outside the loop,
-      // but openmp limits param count
-      const index_t w_block_size = 16;
-      const index_t w_block_count = lhs_width / w_block_size;
-      const index_t w_remain = lhs_width - w_block_size * w_block_count;
-
-      uint8_t lhs_zero_point = static_cast<uint8_t>(lhs->zero_point());
-
uint8_t rhs_zero_point = static_cast(rhs->zero_point()); - - const uint8_t *lhs_data = lhs->data(); - const uint8_t *rhs_data = rhs->data(); - const int32_t *bias_data = nullptr; - if (bias) { - bias_data = bias->data(); + const uint8_t *rhs_base = + rhs_data + static_cast(rhs_batched) * b * lhs_width; + uint32_t sum_rhs = 0; + for (index_t i = 0; i < lhs_width; ++i) { + sum_rhs += static_cast(rhs_base[i]); + } + +#pragma omp parallel for schedule(runtime) + for (index_t h = 0; h < lhs_height; ++h) { + const uint8_t *lhs_ptr = lhs_data + + static_cast(lhs_batched) * b * lhs_height * lhs_width + + h * lhs_width; + const uint8_t *rhs_ptr = rhs_base; + OUTPUT_TYPE *output_ptr = output_data + b * lhs_height + h; + + uint32_t dot = 0; + uint32_t sum_lhs = 0; + uint32x4_t vo0_high_u32 = vdupq_n_u32(0); + uint32x4_t vo0_low_u32 = vdupq_n_u32(0); + uint32x4_t vo1_high_u32 = vdupq_n_u32(0); + uint32x4_t vo1_low_u32 = vdupq_n_u32(0); + uint32x4_t sum_lhs_low_u32 = vdupq_n_u32(0); + uint32x4_t sum_lhs_high_u32 = vdupq_n_u32(0); + + for (index_t w_block_idx = 0; w_block_idx < w_block_count; + ++w_block_idx) { + uint8x8_t vl0_u8 = vld1_u8(lhs_ptr); + uint8x8_t vl1_u8 = vld1_u8(lhs_ptr + 8); + + uint8x8_t vr0_u8 = vld1_u8(rhs_ptr); + uint8x8_t vr1_u8 = vld1_u8(rhs_ptr + 8); + + uint16x8_t vl0_u16 = vmovl_u8(vl0_u8); + uint16x8_t vl1_u16 = vmovl_u8(vl1_u8); + + uint16x8_t vr0_u16 = vmovl_u8(vr0_u8); + uint16x8_t vr1_u16 = vmovl_u8(vr1_u8); + + vo0_high_u32 = vmlal_u16(vo0_high_u32, + vget_high_u16(vl0_u16), + vget_high_u16(vr0_u16)); + vo0_low_u32 = vmlal_u16(vo0_low_u32, + vget_low_u16(vl0_u16), + vget_low_u16(vr0_u16)); + vo1_high_u32 = vmlal_u16(vo1_high_u32, + vget_high_u16(vl1_u16), + vget_high_u16(vr1_u16)); + vo1_low_u32 = vmlal_u16(vo1_low_u32, + vget_low_u16(vl1_u16), + vget_low_u16(vr1_u16)); + + // It can be precuculated if lhs is const, but for this case + // computation is not bottleneck + sum_lhs_high_u32 += vaddl_u16(vget_high_u16(vl0_u16), + vget_high_u16(vl1_u16)); + sum_lhs_low_u32 += vaddl_u16(vget_low_u16(vl0_u16), + vget_low_u16(vl1_u16)); + + lhs_ptr += 16; + rhs_ptr += 16; } - OUTPUT_TYPE *output_data = output->mutable_data(); - int32x4_t voutput_multiplier = vdupq_n_s32(output_multiplier); - int32x4_t voutput_shift_left = vdupq_n_s32(-output_shift); + vo0_low_u32 = vaddq_u32(vo0_high_u32, vo0_low_u32); + vo1_low_u32 = vaddq_u32(vo1_high_u32, vo1_low_u32); + vo0_low_u32 = vaddq_u32(vo0_low_u32, vo1_low_u32); + dot += vaddvq_u32(vo0_low_u32); - uint8x8_t - vlhs_zero_point = vdup_n_u8(lhs_zero_point); - uint8x8_t - vrhs_zero_point = vdup_n_u8(rhs_zero_point); + sum_lhs_low_u32 = vaddq_u32(sum_lhs_high_u32, sum_lhs_low_u32); + sum_lhs = vaddvq_u32(sum_lhs_low_u32); - const uint8_t - *lhs_ptr = lhs_data - + static_cast(lhs_batched) * b * lhs_height * lhs_width - + lhs_width * h_block_idx * h_block_size; - const uint8_t *rhs_ptr = - rhs_data + static_cast(rhs_batched) * b * lhs_width; - OUTPUT_TYPE - *ret_ptr = output_data + b * lhs_height + h_block_idx * h_block_size; - - const index_t h_block_len = - std::min(h_block_size, lhs_height - h_block_idx * h_block_size); - const index_t h_offset = h_block_idx * h_block_size; - - if (h_block_len == 4) { - int32x4_t vo0 = vdupq_n_s32(0); - int32x4_t vo1 = vdupq_n_s32(0); - int32x4_t vo2 = vdupq_n_s32(0); - int32x4_t vo3 = vdupq_n_s32(0); - - index_t r_w_block_count = w_block_count; - // just make compiler happy - MACE_UNUSED(r_w_block_count); - - // Register layout: (4x16) x (16x1) - // - // +----+ - // |d16 | - // | . | - // | . | - // | . 
| - // Rhs +----+ - // |d17 | - // | . | - // | . | - // | . | - // +----+ - // |d18 | - // | . | - // | . | - // | . | - // +----+ - // |d19 | - // | . | - // | . | - // | . | - // +----+ - // - // | | - // - // Lhs | | - // - // +--------+--------+--------+--------+ - - - - +----+ - // | d0 ... | d1 ... | d2 ... | d3 ... | |vo0 | - // | d4 ... | d5 ... | d6 ... | d7 ... | |vo1 | - // | d8 ... | d9 ... | d10... | d11... | |vo2 | - // | d12... | d13... | d14... | d15... | |vo3 | - // +--------+--------+--------+--------+ - - - - +----+ - // - // Accumulator - // - -#if not defined(__aarch64__) - asm volatile( - "cmp %[r_w_block_count], #0\n" - "beq 0f\n" - - "mov r0, %[rhs_ptr]\n" - "mov r1, %[lhs_ptr]\n" - "add r2, r1, %[lhs_width]\n" - "add r3, r2, %[lhs_width]\n" - "add r4, r3, %[lhs_width]\n" - - "vdup.u8 d20, %[rhs_zero_point]\n" - "vdup.u8 d21, %[lhs_zero_point]\n" - - // prelogue - "vld1.8 d16, [r0]!\n" - "vld1.8 d18, [r0]!\n" - - "vld1.8 d0, [r1]!\n" - "vld1.8 d2, [r1]!\n" - "vld1.8 d4, [r2]!\n" - "vld1.8 d6, [r2]!\n" - "vld1.8 d8, [r3]!\n" - "vld1.8 d10, [r3]!\n" - "vld1.8 d12, [r4]!\n" - "vld1.8 d14, [r4]!\n" - - "subs %[r_w_block_count], #1\n" - "beq 1f\n" - - "2: \n" - "vsubl.u8 q8, d16, d20\n" - "vsubl.u8 q9, d18, d20\n" - - "vsubl.u8 q0, d0, d21\n" - "vsubl.u8 q1, d2, d21\n" - "vsubl.u8 q2, d4, d21\n" - "vsubl.u8 q3, d6, d21\n" - "vsubl.u8 q4, d8, d21\n" - "vsubl.u8 q5, d10, d21\n" - "vsubl.u8 q6, d12, d21\n" - "vsubl.u8 q7, d14, d21\n" - - "vmlal.s16 %q[vo0], d0, d16\n" - "vmlal.s16 %q[vo1], d4, d16\n" - "vmlal.s16 %q[vo2], d8, d16\n" - "vmlal.s16 %q[vo3], d12, d16\n" - - "vld1.8 d0, [r1]!\n" - "vld1.8 d4, [r2]!\n" - "vld1.8 d8, [r3]!\n" - "vld1.8 d12, [r4]!\n" - "vld1.8 d16, [r0]!\n" - - "vmlal.s16 %q[vo0], d2, d18\n" - "vmlal.s16 %q[vo1], d6, d18\n" - "vmlal.s16 %q[vo2], d10, d18\n" - "vmlal.s16 %q[vo3], d14, d18\n" - - "vld1.8 d2, [r1]!\n" - "vld1.8 d6, [r2]!\n" - "vld1.8 d10, [r3]!\n" - "vld1.8 d14, [r4]!\n" - "vld1.8 d18, [r0]!\n" - - "vmlal.s16 %q[vo0], d1, d17\n" - "vmlal.s16 %q[vo1], d5, d17\n" - "vmlal.s16 %q[vo2], d9, d17\n" - "vmlal.s16 %q[vo3], d13, d17\n" - - "subs %[r_w_block_count], #1\n" - "vmlal.s16 %q[vo0], d3, d19\n" - "vmlal.s16 %q[vo1], d7, d19\n" - "vmlal.s16 %q[vo2], d11, d19\n" - "vmlal.s16 %q[vo3], d15, d19\n" - - "bne 2b\n" - - // prologue - "1:\n" - "vsubl.u8 q8, d16, d20\n" - "vsubl.u8 q9, d18, d20\n" - - "vsubl.u8 q0, d0, d21\n" - "vsubl.u8 q1, d2, d21\n" - "vsubl.u8 q2, d4, d21\n" - "vsubl.u8 q3, d6, d21\n" - "vsubl.u8 q4, d8, d21\n" - "vsubl.u8 q5, d10, d21\n" - "vsubl.u8 q6, d12, d21\n" - "vsubl.u8 q7, d14, d21\n" - - "vmlal.s16 %q[vo0], d0, d16\n" - "vmlal.s16 %q[vo1], d4, d16\n" - "vmlal.s16 %q[vo2], d8, d16\n" - "vmlal.s16 %q[vo3], d12, d16\n" - - "vmlal.s16 %q[vo0], d1, d17\n" - "vmlal.s16 %q[vo1], d5, d17\n" - "vmlal.s16 %q[vo2], d9, d17\n" - "vmlal.s16 %q[vo3], d13, d17\n" - - "vmlal.s16 %q[vo0], d2, d18\n" - "vmlal.s16 %q[vo1], d6, d18\n" - "vmlal.s16 %q[vo2], d10, d18\n" - "vmlal.s16 %q[vo3], d14, d18\n" - - "vmlal.s16 %q[vo0], d3, d19\n" - "vmlal.s16 %q[vo1], d7, d19\n" - "vmlal.s16 %q[vo2], d11, d19\n" - "vmlal.s16 %q[vo3], d15, d19\n" - - "0:\n" - : // outputs - [vo0] "+w"(vo0), - [vo1] "+w"(vo1), - [vo2] "+w"(vo2), - [vo3] "+w"(vo3), - [r_w_block_count] "+r"(r_w_block_count) - : // inputs - [lhs_ptr] "r"(lhs_ptr), [rhs_ptr] "r"(rhs_ptr), - [lhs_width] "r"(lhs_width), - [lhs_zero_point] "r"(lhs_zero_point), - [rhs_zero_point] "r"(rhs_zero_point) - : // clobbers - "cc", "memory", "r0", "r1", "r2", "r3", "r4", - "d0", "d1", "d2", 
"d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", - "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", - "d21"); - - lhs_ptr += w_block_count * w_block_size; - rhs_ptr += w_block_count * w_block_size; -#else - for (index_t w_block_index = 0; w_block_index < w_block_count; - ++w_block_index) { - uint8x8_t vr0 = vld1_u8(rhs_ptr); - int16x8_t - vxr0 = vreinterpretq_s16_u16(vsubl_u8(vr0, vrhs_zero_point)); - uint8x8_t vr0n = vld1_u8(rhs_ptr + 8); - int16x8_t - vxr0n = vreinterpretq_s16_u16(vsubl_u8(vr0n, vrhs_zero_point)); - - uint8x8_t vl0 = vld1_u8(lhs_ptr); - int16x8_t - vxl0 = vreinterpretq_s16_u16(vsubl_u8(vl0, vlhs_zero_point)); - uint8x8_t vl0n = vld1_u8(lhs_ptr + 8); - int16x8_t - vxl0n = vreinterpretq_s16_u16(vsubl_u8(vl0n, vlhs_zero_point)); - - vo0 = vmlal_s16(vo0, vget_low_s16(vxl0), vget_low_s16(vxr0)); - vo0 = vmlal_high_s16(vo0, vxl0, vxr0); - vo0 = vmlal_s16(vo0, vget_low_s16(vxl0n), vget_low_s16(vxr0n)); - vo0 = vmlal_high_s16(vo0, vxl0n, vxr0n); - - const uint8_t *lhs_ptr1 = lhs_ptr + lhs_width; - - uint8x8_t vl1 = vld1_u8(lhs_ptr1); - int16x8_t - vxl1 = vreinterpretq_s16_u16(vsubl_u8(vl1, vlhs_zero_point)); - uint8x8_t vl1n = vld1_u8(lhs_ptr1 + 8); - int16x8_t - vxl1n = vreinterpretq_s16_u16(vsubl_u8(vl1n, vlhs_zero_point)); - - vo1 = vmlal_s16(vo1, vget_low_s16(vxl1), vget_low_s16(vxr0)); - vo1 = vmlal_high_s16(vo1, vxl1, vxr0); - vo1 = vmlal_s16(vo1, vget_low_s16(vxl1n), vget_low_s16(vxr0n)); - vo1 = vmlal_high_s16(vo1, vxl1n, vxr0n); - - const uint8_t *lhs_ptr2 = lhs_ptr1 + lhs_width; - - uint8x8_t vl2 = vld1_u8(lhs_ptr2); - int16x8_t - vxl2 = vreinterpretq_s16_u16(vsubl_u8(vl2, vlhs_zero_point)); - uint8x8_t vl2n = vld1_u8(lhs_ptr2 + 8); - int16x8_t - vxl2n = vreinterpretq_s16_u16(vsubl_u8(vl2n, vlhs_zero_point)); - - vo2 = vmlal_s16(vo2, vget_low_s16(vxl2), vget_low_s16(vxr0)); - vo2 = vmlal_high_s16(vo2, vxl2, vxr0); - vo2 = vmlal_s16(vo2, vget_low_s16(vxl2n), vget_low_s16(vxr0n)); - vo2 = vmlal_high_s16(vo2, vxl2n, vxr0n); - - const uint8_t *lhs_ptr3 = lhs_ptr2 + lhs_width; - - uint8x8_t vl3 = vld1_u8(lhs_ptr3); - int16x8_t - vxl3 = vreinterpretq_s16_u16(vsubl_u8(vl3, vlhs_zero_point)); - uint8x8_t vl3n = vld1_u8(lhs_ptr3 + 8); - int16x8_t - vxl3n = vreinterpretq_s16_u16(vsubl_u8(vl3n, vlhs_zero_point)); - - vo3 = vmlal_s16(vo3, vget_low_s16(vxl3), vget_low_s16(vxr0)); - vo3 = vmlal_high_s16(vo3, vxl3, vxr0); - vo3 = vmlal_s16(vo3, vget_low_s16(vxl3n), vget_low_s16(vxr0n)); - vo3 = vmlal_high_s16(vo3, vxl3n, vxr0n); - - lhs_ptr += 16; - rhs_ptr += 16; - } -#endif // __aarch64__ - int32x4_t vo = {vaddvq_s32(vo0), - vaddvq_s32(vo1), - vaddvq_s32(vo2), - vaddvq_s32(vo3)}; - - for (index_t w = 0; w < w_remain; ++w) { - vo[0] += - (lhs_ptr[0] - lhs_zero_point) * (rhs_ptr[0] - rhs_zero_point); - vo[1] += (lhs_ptr[lhs_width] - lhs_zero_point) - * (rhs_ptr[0] - rhs_zero_point); - vo[2] += (lhs_ptr[lhs_width * 2] - lhs_zero_point) - * (rhs_ptr[0] - rhs_zero_point); - vo[3] += (lhs_ptr[lhs_width * 3] - lhs_zero_point) - * (rhs_ptr[0] - rhs_zero_point); - ++lhs_ptr; - ++rhs_ptr; - } - - int32x4_t vbias = vdupq_n_s32(0); - if (bias) { - vbias = vld1q_s32(bias_data + h_offset); - } - vo = vaddq_s32(vo, vbias); - - if (is_output_type_uint8) { - int32x4_t vo_mul = vqrdmulhq_s32(vo, voutput_multiplier); - int32x4_t - fixup = vshrq_n_s32(vandq_s32(vo_mul, voutput_shift_left), 31); - int32x4_t fixed_up_x = vqaddq_s32(vo_mul, fixup); - int32x4_t - vo_rescale_int32 = vrshlq_s32(fixed_up_x, voutput_shift_left); - - int16x4_t vo_rescale_int16 = 
vqmovn_s32(vo_rescale_int32); - uint8x8_t vo_rescale_uint8 = - vqmovun_s16(vcombine_s16(vo_rescale_int16, vo_rescale_int16)); - - ret_ptr[0] = vo_rescale_uint8[0]; - ret_ptr[1] = vo_rescale_uint8[1]; - ret_ptr[2] = vo_rescale_uint8[2]; - ret_ptr[3] = vo_rescale_uint8[3]; - } else { - ret_ptr[0] = vo[0]; - ret_ptr[1] = vo[1]; - ret_ptr[2] = vo[2]; - ret_ptr[3] = vo[3]; - } - } else { // h_block_len < 4 - // TODO(liyin): handle here case by case (1,2,3) to accelerate - const uint8_t *tmp_lhs_ptr = lhs_ptr; - const uint8_t *tmp_rhs_ptr = rhs_ptr; - for (index_t h = 0; h < h_block_len; ++h) { - lhs_ptr = tmp_lhs_ptr + h * lhs_width; - rhs_ptr = tmp_rhs_ptr; - int32x4_t vo0 = vdupq_n_s32(0); - for (index_t w = 0; w < w_block_count; ++w) { - uint8x8_t vr0 = vld1_u8(rhs_ptr); - int16x8_t - vxr0 = vreinterpretq_s16_u16(vsubl_u8(vr0, vrhs_zero_point)); - uint8x8_t vr0n = vld1_u8(rhs_ptr + 8); - int16x8_t - vxr0n = vreinterpretq_s16_u16(vsubl_u8(vr0n, vrhs_zero_point)); - - uint8x8_t vl0 = vld1_u8(lhs_ptr); - int16x8_t - vxl0 = vreinterpretq_s16_u16(vsubl_u8(vl0, vlhs_zero_point)); - uint8x8_t vl0n = vld1_u8(lhs_ptr + 8); - int16x8_t - vxl0n = vreinterpretq_s16_u16(vsubl_u8(vl0n, vlhs_zero_point)); - - vo0 = vmlal_s16(vo0, vget_low_s16(vxl0), vget_low_s16(vxr0)); - vo0 = vmlal_high_s16(vo0, vxl0, vxr0); - vo0 = vmlal_s16(vo0, vget_low_s16(vxl0n), vget_low_s16(vxr0n)); - vo0 = vmlal_high_s16(vo0, vxl0n, vxr0n); - - lhs_ptr += 16; - rhs_ptr += 16; - } // w - int32_t s0 = vaddvq_s32(vo0) + (bias ? bias_data[h_offset + h] : 0); - for (index_t w = 0; w < w_remain; ++w) { - s0 += (lhs_ptr[0] - lhs_zero_point) * (rhs_ptr[0] - rhs_zero_point); - ++lhs_ptr; - ++rhs_ptr; - } // w - - if (is_output_type_uint8) { - ret_ptr[h] = - Saturate(std::roundf(s0 * output_multiplier_float)); - } else { - ret_ptr[h] = s0; - } - } // h - } // if - } // h_block_idx + for (index_t w = 0; w < w_block_remain; ++w) { + dot += (*lhs_ptr) * (*rhs_ptr); + sum_lhs += (*lhs_ptr); + ++lhs_ptr; + ++rhs_ptr; + } + + const auto zero_point_dot = + static_cast(lhs_zero_point * rhs_zero_point * lhs_width); + int32_t ret = dot - sum_lhs * rhs_zero_point - sum_rhs * lhs_zero_point + + zero_point_dot; + if (bias) { + ret += bias->data()[h]; + } + + if (is_output_type_uint8_) { + *output_ptr = + Saturate(std::roundf(ret * output_multiplier_float)); + } else { + *output_ptr = ret; + } + } // h } // b + return MaceStatus::MACE_SUCCESS; } @@ -466,7 +183,6 @@ class Gemv; } // namespace ops } // namespace mace -#if defined(vmlal_high_s16) -#undef vmlal_high_s16 -#undef vaddvq_s32 -#endif +#ifdef vaddvq_u32 +#undef vaddvq_u32 +#endif // vaddvq_u32 diff --git a/mace/ops/arm/q8/gemv.h b/mace/ops/arm/q8/gemv.h index adcb9590ebeff38eb8409ec49eb13a84044f64d8..21a275798a7dd9533c1645d606386aa89cf91a92 100644 --- a/mace/ops/arm/q8/gemv.h +++ b/mace/ops/arm/q8/gemv.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -30,7 +30,9 @@ namespace q8 {
 template<typename OUTPUT_TYPE>
 class Gemv {
  public:
-  Gemv() {}
+  Gemv() : is_output_type_uint8_(
+      DataTypeToEnum<OUTPUT_TYPE>::value == DataType::DT_UINT8) {
+  }
   ~Gemv() {}
   // Always row-major after transpose
   MaceStatus Compute(
@@ -44,6 +46,9 @@ class Gemv {
       const bool lhs_batched,
       const bool rhs_batched,
       Tensor *output);
+
+ private:
+  bool is_output_type_uint8_;
 };
 
 }  // namespace q8
diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc
index ee44ec59f7c329215f3a5ba95c8a6bf6e18f6399..469efe2e0c5eaac299d2622931a5e36154973d8e 100644
--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -22,6 +22,7 @@
 #include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/batch_norm.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
@@ -156,8 +157,8 @@ class BatchNormOp : public Operation {
     MemoryType mem_type;
     if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
-      kernel_.reset(new opencl::image::BatchNormKernel<T>(
-          epsilon, activation, relux_max_limit, leakyrelu_coefficient));
+      kernel_ = make_unique<opencl::image::BatchNormKernel<T>>(
+          epsilon, activation, relux_max_limit, leakyrelu_coefficient);
     } else {
       MACE_NOT_IMPLEMENTED;
     }
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index 11bf4f6e74dbfd7141963add244c50f8b9b1ff35..74f7a013c14af8294aaabcddf5a7a29d8662edf1 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc
index 8d1e463c56b3510901d42d5d4370273d252ecbf2..cfd350d458429ea86a68e9176c41108e2469f392 100644
--- a/mace/ops/batch_to_space.cc
+++ b/mace/ops/batch_to_space.cc
@@ -19,6 +19,7 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/batch_to_space.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
@@ -266,7 +267,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase {
   explicit BatchToSpaceNDOp(OpConstructContext *context)
       : BatchToSpaceOpBase(context) {
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::BatchToSpaceKernel<T>);
+      kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>();
     } else {
       MACE_NOT_IMPLEMENTED;
     }
diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc
index 3552a0a31289cbb070bd761644d5711530ea3b80..a8883e1431205f46e5abbb2a78f4b45d8537cec7 100644
--- a/mace/ops/bias_add.cc
+++ b/mace/ops/bias_add.cc
@@ -22,6 +22,7 @@
 #include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/bias_add.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
@@ -34,8 +35,8 @@ class BiasAddOp : public Operation {
  public:
   explicit BiasAddOp(OpConstructContext *context)
       : Operation(context),
-        data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
-            "data_format", NHWC))) {}
+        has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0))
+        {}
 
   MaceStatus Run(OpContext *context) override {
     MACE_UNUSED(context);
@@ -56,7 +57,7 @@ class BiasAddOp : public Operation {
     const float *bias_ptr = bias->data<float>();
     float *output_ptr = output->mutable_data<float>();
 
-    if (input->dim_size() == 4 && data_format_ == NCHW) {
+    if (input->dim_size() == 4 && has_data_format_) {
       const index_t batch = input->dim(0);
       const index_t
channels = input->dim(1); const index_t height_width = input->dim(2) * input->dim(3); @@ -89,7 +90,7 @@ class BiasAddOp : public Operation { } private: - DataFormat data_format_; + int has_data_format_; }; #ifdef MACE_ENABLE_OPENCL @@ -98,12 +99,11 @@ class BiasAddOp : public Operation { public: explicit BiasAddOp(OpConstructContext *context) : Operation(context), - data_format_(static_cast(Operation::GetOptionalArg( - "data_format", NHWC))) { + has_data_format_(Operation::GetOptionalArg("has_data_format", 1)) { MemoryType mem_type; if (context->device()->gpu_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; - kernel_.reset(new opencl::image::BiasAddKernel); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } @@ -120,13 +120,13 @@ class BiasAddOp : public Operation { Tensor *output = this->Output(0); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC, + MACE_CHECK(input->dim_size() == 4 && has_data_format_, "gpu only support biasadd for 4-dimensional NHWC format tensor"); return kernel_->Compute(context, input, bias, output); } private: - DataFormat data_format_; + int has_data_format_; std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 920a478f7202d6af7bef000ea4693cc8aa67c292..7de89dd2296829390eb1964911af5378c6edf9cc 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -42,7 +42,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { OpDefBuilder("BiasAdd", "BiasAddBM") .Input("Input") .Input("Bias") - .AddIntArg("data_format", data_format) + .AddIntArg("has_data_format", 1) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index 92b918592f984692ccaed7744bb4f4cc9fb3a17e..2e4764cac8ad2cf1f303a2e53c64fda444023fa3 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -36,7 +36,7 @@ void BiasAddSimple() { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run @@ -90,7 +90,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); @@ -139,7 +139,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index d4e687b846f340cddbaf4cd3b50854f326b6eb44..8249c344bb4c7fed189aeae4afee3f42fce6c70c 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -50,8 +50,6 @@ void FilterBufferToImage(int iters, b2i_output); }; - // Warm-up - net.Setup(D); for (int i = 0; i < 5; ++i) { transform_func(); } diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc index 15f6e7d323e7885f779a015d99403e9ed7fc6f2d..229d4eb9657432f7966368da759cb0b497972ee9 100644 --- a/mace/ops/buffer_transform.cc +++ b/mace/ops/buffer_transform.cc @@ -39,14 +39,14 @@ class BufferTransformOp : public Operation { 
auto type = static_cast(Operation::GetOptionalArg( "buffer_type", static_cast(CONV2D_FILTER))); - auto data_format = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); + bool has_data_format = Operation::GetOptionalArg("has_data_format", 0) + != 0; MemoryType in_mem_type = context->workspace()->GetTensor( operator_def_->input(0))->memory_type(); return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( context, input, type, out_mem_type_, wino_blk_size_, - data_format, output); + has_data_format, output); } private: diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index 57607755cc034f364d07660924d6481e3d79793b..70e1811a07292af8eb0982caf46decb393f28325 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -18,6 +18,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/channel_shuffle.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -83,7 +84,7 @@ class ChannelShuffleOp : public Operation { : Operation(context) { const int groups = Operation::GetOptionalArg("group", 1); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ChannelShuffleKernel(groups)); + kernel_ = make_unique>(groups); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/common/conv_pool_2d_util.cc b/mace/ops/common/conv_pool_2d_util.cc index 8634cf2cb8333d03a97b131692c84d5f5249cab5..ade33c59002d3924123eede8687269de3abb2119 100644 --- a/mace/ops/common/conv_pool_2d_util.cc +++ b/mace/ops/common/conv_pool_2d_util.cc @@ -24,7 +24,7 @@ namespace ops { void CalcPaddingAndOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const FilterDataFormat filter_format, + const DataFormat filter_format, const int *dilations, const int *strides, Padding padding, @@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC void CalcOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const FilterDataFormat filter_format, + const DataFormat filter_format, const int *padding_size, const int *dilations, const int *strides, diff --git a/mace/ops/common/conv_pool_2d_util.h b/mace/ops/common/conv_pool_2d_util.h index db359ee92b02a88c48555ada851047f3ebe7f2e5..e8d0d335f1e0900cf1c265817cbcd73dd63c66b3 100644 --- a/mace/ops/common/conv_pool_2d_util.h +++ b/mace/ops/common/conv_pool_2d_util.h @@ -35,7 +35,7 @@ namespace ops { void CalcPaddingAndOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const FilterDataFormat filter_format, + const DataFormat filter_format, const int *dilations, const int *strides, Padding padding, @@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, void CalcOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const FilterDataFormat filter_format, + const DataFormat filter_format, const int *padding_size, const int *dilations, const int *strides, diff --git a/mace/ops/common/eltwise_type.h b/mace/ops/common/eltwise_type.h new file mode 100644 index 0000000000000000000000000000000000000000..634c4919c18f221b255939a01d8411428b8f3476 --- /dev/null +++ b/mace/ops/common/eltwise_type.h @@ -0,0 +1,40 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_COMMON_ELTWISE_TYPE_H_ +#define MACE_OPS_COMMON_ELTWISE_TYPE_H_ + +namespace mace { +namespace ops { + +enum EltwiseType { + SUM = 0, + SUB = 1, + PROD = 2, + DIV = 3, + MIN = 4, + MAX = 5, + NEG = 6, + ABS = 7, + SQR_DIFF = 8, + POW = 9, + EQUAL = 10, + FLOOR_DIV = 11, + NONE = 12, +}; + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_COMMON_ELTWISE_TYPE_H_ diff --git a/mace/ops/gemmlowp_util.h b/mace/ops/common/gemmlowp_util.h similarity index 96% rename from mace/ops/gemmlowp_util.h rename to mace/ops/common/gemmlowp_util.h index c7091544ef5d90ef5fa11cbaacb052744dbe0ef0..c7eed2ad275c9b51cc5cf55cf2f88f90edf3d500 100644 --- a/mace/ops/gemmlowp_util.h +++ b/mace/ops/common/gemmlowp_util.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_GEMMLOWP_UTIL_H_ -#define MACE_OPS_GEMMLOWP_UTIL_H_ +#ifndef MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ +#define MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ #include @@ -75,4 +75,4 @@ struct GemmlowpOutputPipeline { }; } // namespace mace -#endif // MACE_OPS_GEMMLOWP_UTIL_H_ +#endif // MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 47e95a37190cbf2eb6aed08af544220ad9ce8643..6b2ac58a23e3ebbcb59e72300b682cd809263cca 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -16,6 +16,7 @@ #include "mace/core/operator.h" #include "mace/utils/quantize.h" +#include "mace/utils/memory.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/concat.h" @@ -59,9 +60,9 @@ class ConcatOp : public ConcatOpBase { MACE_UNUSED(context); if (!checked_) { Validate(); - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && this->Input(0)->dim_size() == 4) { if (axis_ == 3) axis_ = 1; else if (axis_ == 2) axis_ = 3; else if (axis_ == 1) axis_ = 2; @@ -199,7 +200,7 @@ class ConcatOp : public ConcatOpBase { explicit ConcatOp(OpConstructContext *context) : ConcatOpBase(context) { if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ConcatKernel(axis_)); + kernel_ = make_unique>(axis_); } else { MACE_NOT_IMPLEMENTED; } @@ -250,9 +251,12 @@ void RegisterConcat(OpRegistryBase *op_registry) { if (op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } else { + int has_data_format = + ProtoArgHelper::GetOptionalArg( + *op, "has_data_format", 0); int axis = ProtoArgHelper::GetOptionalArg( *op, "axis", 3); - if (axis != 3) { + if (!has_data_format || axis != 3) { return { DeviceType::CPU }; } bool divisible_four = true; diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index 88061a7b19804b9fda948bdc7c556fd2b81638fa..22eb544f96f15465177170868bdf4e68bcf46ab4 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -91,6 +91,7 @@ void OpenCLConcatHelper(int iters, .Input("Input0") .Input("Input1") .AddIntArg("axis", concat_dim) + 
.AddIntArg("has_data_format", 1) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index fc57920b7fe7a7e3ca2d4aca8bb7fd80a2d76aa7..bc41b11e394835e22ad3670d49e67781ec4ea372 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -100,11 +100,12 @@ TEST_F(ConcatOpTest, CPUSimpleVertical) { } } -TEST_F(ConcatOpTest, CPURandom) { +namespace { +void CPURandomTest(int input_dim, int has_data_format) { static unsigned int seed = time(NULL); - int dim = 5; + int dim = input_dim; int num_inputs = 2 + rand_r(&seed) % 10; - int axis = 1; + int axis = 3; // Construct graph OpsTestNet net; auto builder = OpDefBuilder("Concat", "ConcatTest"); @@ -112,9 +113,13 @@ TEST_F(ConcatOpTest, CPURandom) { builder = builder.Input(MakeString("Input", i)); } builder.AddIntArg("axis", axis) + .AddIntArg("has_data_format", has_data_format) .Output("Output") .Finalize(net.NewOperatorDef()); + if (has_data_format) { + axis = 1; + } std::vector shape_data; GenerateRandomIntTypeData({dim}, &shape_data, 1, dim); std::vector> input_shapes(num_inputs, shape_data); @@ -152,6 +157,13 @@ TEST_F(ConcatOpTest, CPURandom) { } } } +} // namespace + +TEST_F(ConcatOpTest, CPURandom) { + CPURandomTest(5, 0); + CPURandomTest(4, 0); + CPURandomTest(4, 1); +} TEST_F(ConcatOpTest, QuantizedCPURandom) { static unsigned int seed = time(NULL); @@ -186,7 +198,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { builder = builder.Input(MakeString("Input", i)); } builder.AddIntArg("axis", axis_arg) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); @@ -248,7 +260,7 @@ namespace { template void OpenCLRandomTest(const std::vector> &shapes, const int axis, - DataFormat data_format) { + bool has_data_format) { srand(time(nullptr)); int num_inputs = shapes.size(); int concat_axis_size = 0; @@ -275,7 +287,7 @@ void OpenCLRandomTest(const std::vector> &shapes, builder.AddIntArg("axis", axis) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("data_format", data_format) + .AddIntArg("has_data_format", has_data_format) .OutputShape(expected_shape) .Finalize(net.NewOperatorDef()); @@ -309,38 +321,37 @@ void OpenCLRandomTest(const std::vector> &shapes, } // namespace TEST_F(ConcatOpTest, OPENCLAligned) { - OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, - DataFormat::NHWC); + OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, 1); } TEST_F(ConcatOpTest, OPENCLHalfAligned) { - OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, - DataFormat::NHWC); + OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, 1); } TEST_F(ConcatOpTest, OPENCLUnAligned) { - OpenCLRandomTest({{3, 32, 32, 13}, {3, 32, 32, 17}}, 3, - DataFormat::NHWC); + OpenCLRandomTest({{3, 32, 32, 13}, {3, 32, 32, 17}}, 3, 1); } TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) { OpenCLRandomTest( {{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}}, - 3, DataFormat::NHWC); + 3, 1); } TEST_F(ConcatOpTest, GPUFallbackToCPU2DInput) { - OpenCLRandomTest({{3, 4}, {3, 4}}, 1, DataFormat::DF_NONE); + OpenCLRandomTest({{3, 4}, {3, 4}}, 1, 0); } TEST_F(ConcatOpTest, GPUFallbackToCPUChanNotDivisibleBy4) { - OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 3, - DataFormat::DF_NONE); + OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 3, 0); +} + +TEST_F(ConcatOpTest, GPUFallbackToCPUNoDataFormat) { + OpenCLRandomTest({{1, 1, 4, 4}, {1, 1, 4, 4}}, 
3, 0); } TEST_F(ConcatOpTest, GPUFallbackToCPUAxis2) { - OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 2, - DataFormat::DF_NONE); + OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 2, 0); } } // namespace test diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 19794b38be56fe3a99deb0583b0967575de571ae..a6421f45fed1b0520e468acaae58c5439c8c03e3 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(MACE_ENABLE_NEON) #include #endif #include @@ -27,21 +27,26 @@ #include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/ops/activation.h" -#include "mace/ops/arm/conv_2d_neon.h" -#include "mace/ops/arm/conv_winograd.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/utils/utils.h" +#include "mace/utils/memory.h" +#include "mace/utils/math.h" #ifdef MACE_ENABLE_NEON #include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/fp32/conv_2d_1x1.h" -#else -#include "mace/ops/ref/conv_2d.h" +#include "mace/ops/arm/fp32/conv_2d_3x3.h" +#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h" +#include "mace/ops/arm/fp32/conv_2d_5x5.h" +#include "mace/ops/arm/fp32/conv_2d_7x7.h" +#include "mace/ops/arm/fp32/conv_2d_1xn.h" +#include "mace/ops/arm/fp32/conv_general.h" #endif // MACE_ENABLE_NEON +#include "mace/ops/ref/conv_2d.h" + #ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #include "mace/ops/quantization_util.h" #endif // MACE_ENABLE_QUANTIZE @@ -54,22 +59,20 @@ namespace mace { namespace ops { -template +template class Conv2dOp; -template <> +template<> class Conv2dOp : public ConvPool2dOpBase { public: explicit Conv2dOp(OpConstructContext *context) : ConvPool2dOpBase(context), activation_(ops::StringToActivationType( Operation::GetOptionalArg("activation", - "NOOP"))), + "NOOP"))), relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)), - is_filter_transformed_(false), - conv2d_delegator_(nullptr) {} + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); @@ -77,445 +80,99 @@ class Conv2dOp : public ConvPool2dOpBase { const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; Tensor *output = this->Output(OUTPUT); - index_t input_batch = input->dim(0); - index_t input_channels = input->dim(1); - std::vector filter_shape(4); - filter_shape = filter->shape(); - - index_t stride_h = strides_[0]; - index_t stride_w = strides_[1]; - - index_t dilation_h = dilations_[0]; - index_t dilation_w = dilations_[1]; - - std::vector output_shape(4); - std::vector paddings(2); - if (paddings_.empty()) { - CalcNCHWPaddingAndOutputSize(input->shape().data(), - filter_shape.data(), - dilations_.data(), - strides_.data(), - padding_type_, - output_shape.data(), - paddings.data()); - } else { - paddings = paddings_; - CalcNCHWOutputSize(input->shape().data(), - filter_shape.data(), - paddings_.data(), - dilations_.data(), - strides_.data(), - RoundType::FLOOR, - output_shape.data()); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t height = output->dim(2); - index_t width = output->dim(3); - - MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); - MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels); - MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ", - input_channels); + const index_t channels = filter->dim(0); #ifdef MACE_ENABLE_NEON - index_t input_height = input->dim(2); - index_t input_width = input->dim(3); - index_t filter_h = filter->dim(2); - index_t filter_w = filter->dim(3); - - if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { - if (conv2d_delegator_.get() == nullptr) { - conv2d_delegator_.reset(new arm::fp32::Conv2dK1x1()); - } - conv2d_delegator_->Compute(context, input, filter, output); - } else { - // TODO(liyin): the code below needs to be refactored. - // delegate to each of kernels instead of ruling them all - index_t padded_input_height = input_height + paddings[0]; - index_t padded_input_width = input_width + paddings[1]; - index_t extra_input_height = padded_input_height; - index_t extra_input_width = padded_input_width; - index_t extra_output_height = height; - index_t extra_output_width = width; - - int pad_top = paddings[0] >> 1; - int pad_bottom = paddings[0] - pad_top; - int pad_left = paddings[1] >> 1; - int pad_right = paddings[1] - pad_left; - - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard output_guard(output); - - auto filter_data = filter->data(); - auto output_data = output->mutable_data(); - - std::function conv_func; - - bool - use_winograd = filter_h == 3 && filter_w == 3 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1 - && input_channels >= 8 && channels >= 8; - bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3 + // the following params are used to decide which conv delegator to use + const index_t stride_h = strides_[0]; + const index_t stride_w = strides_[1]; + const index_t dilation_h = dilations_[0]; + const index_t dilation_w = dilations_[1]; + const index_t filter_h = filter->dim(2); + const index_t filter_w = filter->dim(3); + const index_t input_channels = input->dim(1); + + // NOTE: delegator is fixed after first round of running, + // although winograd depends on input params. + // We do not support changeable filter for now. 
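+      // The chain below tries the most specialized delegator first: 1x1,
+      // 3x3 (Winograd when both channel counts are >= 8, plain 3x3
+      // otherwise), 3x3/s2, 5x5, 7x7 (stride 1/2/3), then the 1x7/7x1
+      // variants; shapes matching none of these fall through to later
+      // branches (presumably ending in the Conv2dGeneral fallback
+      // included above).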
+ if (conv2d_delegator_.get() == nullptr) { + if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3 + && dilation_w == 1) { + if (input_channels >= 8 && channels >= 8) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } + } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_5x5_s1 = filter_h == 5 && filter_w == 5 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 5 && filter_w == 5 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_1x7_s1 = filter_h == 1 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x1_s1 = filter_h == 7 && filter_w == 1 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s1 = filter_h == 7 && filter_w == 7 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s2 = filter_h == 7 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 2 && stride_w == 2 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s3 = filter_h == 7 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 3 && stride_w == 3 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_1x15_s1 = filter_h == 1 && filter_w == 15 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 1 && filter_w == 7 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_15x1_s1 = filter_h == 15 && filter_w == 1 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - - std::vector transformed_input_shape; - std::vector transformed_output_shape; - std::vector transformed_filter_shape; - - // When size of input feature map is bigger than 16x16, - // set winograd out tile size to 6 to get higher performance. 
- index_t winograd_out_tile_size = 2; - if (input_height > 16 && input_width > 16) { - winograd_out_tile_size = 6; - } - - if (use_winograd) { - extra_output_height = RoundUp(height, winograd_out_tile_size); - extra_input_height = - std::max(padded_input_height, extra_output_height + 2); - extra_output_width = RoundUp(width, winograd_out_tile_size); - extra_input_width = - std::max(padded_input_width, extra_output_width + 2); - if (extra_input_height != padded_input_height) { - pad_bottom += (extra_input_height - padded_input_height); - } - if (extra_input_width != padded_input_width) { - pad_right += (extra_input_width - padded_input_width); - } - - index_t - tile_height_count = extra_output_height / winograd_out_tile_size; - index_t tile_width_count = extra_output_width / winograd_out_tile_size; - index_t tile_count = tile_height_count * tile_width_count; - index_t in_tile_area = - (winograd_out_tile_size + 2) * (winograd_out_tile_size + 2); - - transformed_input_shape.insert(transformed_input_shape.end(), - {in_tile_area, batch, input_channels, - tile_count}); - transformed_output_shape.insert(transformed_output_shape.end(), - {in_tile_area, batch, channels, - tile_count}); - transformed_filter_shape.insert(transformed_filter_shape.end(), - {in_tile_area, channels, - input_channels}); - } else { - index_t tile_h, tile_w; - if (use_neon_3x3_s1) { - tile_h = 2; - tile_w = 4; - } else if (use_neon_7x1_s1 || use_neon_15x1_s1) { - tile_h = 4; - tile_w = 1; - } else { - tile_h = 1; - tile_w = 4; - } - extra_output_height = RoundUp(height, tile_h); - extra_input_height = - std::max(padded_input_height, (extra_output_height - 1) * stride_h - + (filter_h - 1) * dilation_h + 1); - extra_output_width = RoundUp(width, tile_w); - extra_input_width = - std::max(padded_input_width, (extra_output_width - 1) * stride_w - + (filter_w - 1) * dilation_w + 1); - if (extra_input_height != padded_input_height) { - pad_bottom += (extra_input_height - padded_input_height); - } - if (extra_input_width != padded_input_width) { - pad_right += (extra_input_width - padded_input_width); - } - } - - // decide scratch size before allocate it - index_t total_scratch_size = 0; - index_t transformed_input_size = 0; - index_t transformed_output_size = 0; - index_t padded_input_size = 0; - index_t padded_output_size = 0; - if (use_winograd) { - transformed_input_size = - std::accumulate(transformed_input_shape.begin(), - transformed_input_shape.end(), - 1, - std::multiplies()) * sizeof(float); - transformed_output_size = - std::accumulate(transformed_output_shape.begin(), - transformed_output_shape.end(), - 1, - std::multiplies()) * sizeof(float); - total_scratch_size += transformed_input_size + transformed_output_size; - } - if (extra_input_height != input_height - || extra_input_width != input_width) { - padded_input_size = - batch * input_channels * (input_height + pad_top + pad_bottom) - * (input_width + pad_left + pad_right) * sizeof(float) + - MACE_EXTRA_BUFFER_PAD_SIZE; - total_scratch_size += padded_input_size; - } - if (extra_output_height != height || extra_output_width != width) { - padded_output_size = - batch * channels * extra_output_height * extra_output_width - * sizeof(float); - total_scratch_size += padded_output_size; - } - - if (use_winograd) { - total_scratch_size += transformed_input_size + transformed_output_size; - } - - // Init scratch buffer - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(total_scratch_size); - Tensor - 
transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT); - Tensor - transformed_output - (scratch->Scratch(transformed_output_size), DT_FLOAT); - Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT); - Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT); - const index_t extra_input_shape[4] = - {batch, input_channels, extra_input_height, extra_input_width}; - const index_t extra_output_shape[4] = - {batch, channels, extra_output_height, extra_output_width}; - - // make host compiler happy - MACE_UNUSED(extra_input_shape); - MACE_UNUSED(extra_output_shape); - - Tensor transformed_filter; - - // decide which convolution function to call - if (use_winograd) { - transformed_input.Reshape(transformed_input_shape); - transformed_output.Reshape(transformed_output_shape); - const float *transformed_filter_data = nullptr; - // filter only needs to be transformed once, set transformed_filter_data - // to null after the first run. - if (!is_filter_transformed_) { - transformed_filter.Resize(transformed_filter_shape); - switch (winograd_out_tile_size) { - case 2: - TransformFilter4x4(filter_data, - filter_shape[1], - filter_shape[0], - transformed_filter.mutable_data()); - break; - case 6: - TransformFilter8x8(filter_data, - filter_shape[1], - filter_shape[0], - transformed_filter.mutable_data()); - break; - default:MACE_NOT_IMPLEMENTED; - } - transformed_filter_data = transformed_filter.data(); - is_filter_transformed_ = true; - } - - float *transformed_input_data = transformed_input.mutable_data(); - float - *transformed_output_data = transformed_output.mutable_data(); - - conv_func = [=](const float *pad_input, float *pad_output) { - WinoGradConv3x3s1(pad_input, - transformed_filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - channels, - winograd_out_tile_size, - transformed_input_data, - transformed_output_data, - pad_output, - &sgemm_, - scratch); - }; - } else if (use_neon_3x3_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK3x3S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_3x3_s2) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK3x3S2(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_5x5_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK5x5S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_1x7_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK1x7S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x1_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x1S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s2) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S2(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s3) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S3(pad_input, - filter_data, - extra_input_shape, - 
extra_output_shape,
-                          pad_output);
-        };
-      } else if (use_neon_1x15_s1) {
-        conv_func = [=](const float *pad_input, float *pad_output) {
-          Conv2dNeonK1x15S1(pad_input,
-                            filter_data,
-                            extra_input_shape,
-                            extra_output_shape,
-                            pad_output);
-        };
-      } else if (use_neon_15x1_s1) {
-        conv_func = [=](const float *pad_input, float *pad_output) {
-          Conv2dNeonK15x1S1(pad_input,
-                            filter_data,
-                            extra_input_shape,
-                            extra_output_shape,
-                            pad_output);
-        };
-      } else {
-        conv_func = [=](const float *pad_input, float *pad_output) {
-          Conv2dGeneral(pad_input,
-                        filter_data,
-                        extra_input_shape,
-                        extra_output_shape,
-                        filter_shape.data(),
-                        strides_.data(),
-                        dilations_.data(),
-                        pad_output);
-        };
-      }
-
-      // pad input and output
-      const Tensor *pad_input_ptr = input;
-      if (extra_input_height != input_height
-          || extra_input_width != input_width) {
-        MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding(
-            input, pad_top, pad_bottom, pad_left, pad_right, &padded_input));
-        pad_input_ptr = &padded_input;
-      }
-
-      // TODO(libin): don't need clear after bias is integrated in each conv
-      Tensor *pad_output_ptr = output;
-      if (extra_output_height != height || extra_output_width != width) {
-        padded_output.Reshape({batch, channels, extra_output_height,
-                               extra_output_width});
-        padded_output.Clear();
-        pad_output_ptr = &padded_output;
+          && dilation_w == 1) {
+        conv2d_delegator_ = make_unique<arm::fp32::Conv2dK7x1S1>(
+            paddings_, padding_type_);
+      } else if (filter_h == 1 && filter_w == 15
+          && stride_h == 1 && stride_w == 1 && dilation_h == 1
+          && dilation_w == 1) {
+        conv2d_delegator_ = make_unique<arm::fp32::Conv2dK1x15S1>(
+            paddings_, padding_type_);
+      } else if (filter_h == 15 && filter_w == 1
+          && stride_h == 1 && stride_w == 1 && dilation_h == 1
+          && dilation_w == 1) {
+        conv2d_delegator_ = make_unique<arm::fp32::Conv2dK15x1S1>(
+            paddings_, padding_type_);
       } else {
-        output->Clear();
-      }
-
-      const float *pad_input_data = pad_input_ptr->data<float>();
-      float *pad_output_data = pad_output_ptr->mutable_data<float>();
-
-      conv_func(pad_input_data, pad_output_data);
-
-      // unpack output
-      if (extra_output_height != height || extra_output_width != width) {
-#pragma omp parallel for collapse(2) schedule(runtime)
-        for (index_t b = 0; b < batch; ++b) {
-          for (index_t c = 0; c < channels; ++c) {
-            for (index_t h = 0; h < height; ++h) {
-              memcpy(
-                  output_data + b * channels * height * width
-                      + c * height * width
-                      + h * width,
-                  pad_output_data
-                      + b * channels * extra_output_height * extra_output_width
-                      + c * extra_output_height * extra_output_width
-                      + h * extra_output_width,
-                  sizeof(float) * width);
-            }
-          }
-        }
+        conv2d_delegator_ = make_unique<arm::fp32::Conv2dGeneral>(
+            strides_,
+            dilations_,
+            paddings_,
+            padding_type_);
       }
     }
+
+    conv2d_delegator_->Compute(context, input, filter, output);
 #else
-    if (conv2d_delegator_.get() == nullptr) {
-      conv2d_delegator_.reset(new ref::Conv2d<float>(paddings[0],
-                                                     paddings[1],
-                                                     stride_h,
-                                                     stride_w,
-                                                     dilation_h,
-                                                     dilation_w));
+    if (ref_conv2d_delegator_.get() == nullptr) {
+      ref_conv2d_delegator_ = make_unique<ref::Conv2d<float>>(strides_,
+                                                              dilations_,
+                                                              paddings_,
+                                                              padding_type_);
     }
-    conv2d_delegator_->Compute(context, input, filter, output);
+    ref_conv2d_delegator_->Compute(context, input, filter, output);
 #endif
 
     Tensor::MappingGuard bias_guard(bias);
@@ -523,6 +180,9 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
     auto bias_data = bias == nullptr ?
nullptr : bias->data(); auto output_data = output->mutable_data(); if (bias_data != nullptr) { + const index_t batch = input->dim(0); + const index_t height = output->dim(2); + const index_t width = output->dim(3); const index_t image_size = height * width; #pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { @@ -555,188 +215,13 @@ class Conv2dOp : public ConvPool2dOpBase { } private: - void Conv2dGeneral(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const index_t *filter_shape, - const int *stride_hw, - const int *dilation_hw, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = filter_shape[1] * in_image_size; - const index_t out_batch_size = filter_shape[0] * out_image_size; - const index_t filter_size = filter_shape[2] * filter_shape[3]; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < in_shape[0]; b++) { - for (index_t m = 0; m < filter_shape[0]; m += 4) { - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - - const int stride_h = stride_hw[0]; - const int stride_w = stride_hw[1]; - const int dilation_h = dilation_hw[0]; - const int dilation_w = dilation_hw[1]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = out_ptr0_base + out_image_size; - float *out_ptr2_base = out_ptr1_base + out_image_size; - float *out_ptr3_base = out_ptr2_base + out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; - const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; - const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (4 outch x 1 height x 4 width): vo_outch_height - float vo0[4], vo1[4], vo2[4], vo3[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - vo1[ow] = out_ptr1_base[out_offset + ow]; - vo2[ow] = out_ptr2_base[out_offset + ow]; - vo3[ow] = out_ptr3_base[out_offset + ow]; - } - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - // outch 1 - vo1[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr1[kw]; - vo1[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[3] += 
in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - // outch 2 - vo2[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr2[kw]; - vo2[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - // outch 3 - vo3[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr3[kw]; - vo3[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - filter_ptr1 += filter_shape[3]; - filter_ptr2 += filter_shape[3]; - filter_ptr3 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - out_ptr1_base[out_offset + ow] = vo1[ow]; - out_ptr2_base[out_offset + ow] = vo2[ow]; - out_ptr3_base[out_offset + ow] = vo3[ow]; - } - - filter_ptr0 -= filter_size; - filter_ptr1 -= filter_size; - filter_ptr2 -= filter_size; - filter_ptr3 -= filter_size; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + mm * in_channels * filter_size + c * filter_size; - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (1 outch x 1 height x 4 width): vo_outch_height - float vo0[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - } - - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - } - filter_ptr0 -= filter_size; - } // w - } // h - } // c - } // mm - } // if - } // m - } // b - } const ActivationType activation_; const float relux_max_limit_; const float leakyrelu_coefficient_; - bool is_filter_transformed_; - SGemm sgemm_; #ifdef MACE_ENABLE_NEON std::unique_ptr conv2d_delegator_; #else - std::unique_ptr> conv2d_delegator_; + std::unique_ptr> ref_conv2d_delegator_; #endif // MACE_ENABLE_NEON private: @@ -744,7 +229,6 @@ class Conv2dOp : public ConvPool2dOpBase { MACE_OP_OUTPUT_TAGS(OUTPUT); }; - #ifdef MACE_ENABLE_QUANTIZE template <> class Conv2dOp : public ConvPool2dOpBase { @@ -848,7 +332,7 @@ class Conv2dOp : public ConvPool2dOpBase { ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); 
scratch->GrowSize(im2col_size);
-      im2col.reset(new Tensor(scratch->Scratch(im2col_size), DT_UINT8));
+      im2col = make_unique<Tensor>(scratch->Scratch(im2col_size), DT_UINT8);
       uint8_t *im2col_data = im2col->mutable_data<uint8_t>();
       Im2col(input_data, input->shape(), filter_h, filter_w, stride_h,
              stride_w, static_cast<uint8_t>(input->zero_point()),
@@ -993,10 +477,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
     MemoryType mem_type;
     if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
-      kernel_.reset(new opencl::image::Conv2dKernel<T>);
+      kernel_ = make_unique<opencl::image::Conv2dKernel<T>>();
     } else {
       mem_type = MemoryType::GPU_BUFFER;
-      kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
+      kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>();
     }
     context->set_output_mem_type(mem_type);
     // Transform filter tensor to target format
@@ -1051,7 +535,6 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
 };
 #endif  // MACE_ENABLE_OPENCL
 
-
 void RegisterConv2D(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
                    DeviceType::CPU, float);
diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc
index 0d41845795ecf6b50a9016c99e4e84e0c05d120c..3dda169dd80f02a258d854ce88c7f511beab0167 100644
--- a/mace/ops/crop.cc
+++ b/mace/ops/crop.cc
@@ -15,6 +15,8 @@
 #include <vector>
 
 #include "mace/core/operator.h"
+#include "mace/utils/math.h"
+#include "mace/utils/memory.h"
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/crop.h"
 #endif  // MACE_ENABLE_OPENCL
@@ -23,12 +25,24 @@ namespace mace {
 namespace ops {
 
 template <DeviceType D, class T>
-class CropOp : public Operation {
+class CropOp;
+
+template <class T>
+class CropOp<DeviceType::CPU, T> : public Operation {
  public:
   explicit CropOp(OpConstructContext *context)
       : Operation(context),
-        axis_(Operation::GetOptionalArg<int>("axis", 2)),
-        offset_(Operation::GetRepeatedArgs<int>("offset")) {}
+        offset_(Operation::GetRepeatedArgs<int>("offset")) {
+    MACE_CHECK(offset_.size() == 4,
+               "Crop op only supports 4-dimensional inputs now.");
+    auto has_df = Operation::GetOptionalArg<int>(
+        "has_data_format", 0);
+    if (has_df) {
+      // NHWC -> NCHW
+      offset_ = TransposeShape<int, int>(offset_, {0, 3, 1, 2});
+    }
+  }
+
   MaceStatus Run(OpContext *context) override {
     MACE_UNUSED(context);
@@ -46,21 +60,13 @@ class CropOp<DeviceType::CPU, T> : public Operation {
     std::vector<index_t> output_shape(input0->shape());
 
     for (index_t i = 0; i < in0_dims; ++i) {
-      int32_t crop_offset = 0;
-      index_t new_size = input0->dim(i);
-      if (i >= axis_) {
-        new_size = input1->dim(i);
-        if (offset_.size() == 1) {
-          crop_offset = offset_[0];
-        } else if (offset_.size() > 1) {
-          crop_offset = offset_[i - axis_];
-        }
-        MACE_CHECK(input0->dim(i) - crop_offset >= input1->dim(i))
-            << "the crop for dimension" << i << "is out of bound with size"
-            << input1->dim(i) << "and offset" << crop_offset;
+      if (offset_[i] >= 0) {
+        output_shape[i] = input1->dim(i);
+        offsets[i] = offset_[i];
+        MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i))
+            << "the crop for dimension " << i << " is out of bound with size "
+            << input1->dim(i) << " and offset " << offsets[i];
       }
-      output_shape[i] = new_size;
-      offsets[i] = crop_offset;
     }
     MACE_RETURN_IF_ERROR(output->Resize(output_shape));
     T *output_data = output->mutable_data<T>();
@@ -102,7 +108,6 @@ class CropOp<DeviceType::CPU, T> : public Operation {
   }
 
  private:
-  const int axis_;
   std::vector<int> offset_;
 };
 
@@ -112,10 +117,9 @@ class CropOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit CropOp(OpConstructContext *context)
       : Operation(context) {
-    const int axis = Operation::GetOptionalArg<int>("axis", 2);
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::CropKernel(
-          axis, Operation::GetRepeatedArgs<int>("offset")));
+
kernel_ = make_unique>( + Operation::GetRepeatedArgs("offset")); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc index 4ca25b15a3cd607e9b8394bc090e502486cc93e7..724d8ca2958360e991031b003af59f4a3f27b183 100644 --- a/mace/ops/crop_benchmark.cc +++ b/mace/ops/crop_benchmark.cc @@ -21,107 +21,80 @@ namespace test { namespace { template -void CropHelper(int iters, int crop_axis, int dim1, int offset) { +void CropHelper(int iters, + const std::vector &shape0, + const std::vector &shape1, + int crop_axis, + int offset) { mace::testing::StopTiming(); OpsTestNet net; - OpDefBuilder("Crop", "CropBM") - .Input("Input0") - .Input("Input1") - .AddIntArg("axis", crop_axis) - .AddIntsArg("offset", {offset}) - .Output("Output") - .Finalize(net.NewOperatorDef()); - // Add input data - const int kDim0 = 100; - net.AddRandomInput("Input0", {1, kDim0, dim1, dim1, }); - net.AddRandomInput("Input1", - {1, kDim0 / 2, dim1 / 2, dim1 / 2}); + std::vector offsets(4, -1); - // Warm-up - for (int i = 0; i < 5; ++i) { - net.RunOp(D); + for (int i = crop_axis; i < 4; ++i) { + offsets[i] = offset; } - const int64_t tot = static_cast(iters) * kDim0 * dim1 * dim1; - testing::BytesProcessed(tot * sizeof(T)); - mace::testing::StartTiming(); - while (iters--) { - net.RunOp(D); - } -} -} // namespace - -#define MACE_BM_CROP_CPU_MACRO(AXIS, DIM, OFFSET) \ - static void MACE_BM_CROP_CPU_##AXIS##_##DIM##_##OFFSET(int iters) { \ - CropHelper(iters, AXIS, DIM, OFFSET); \ - } \ - MACE_BENCHMARK(MACE_BM_CROP_CPU_##AXIS##_##DIM##_##OFFSET) - -MACE_BM_CROP_CPU_MACRO(1, 256, 3); -MACE_BM_CROP_CPU_MACRO(2, 256, 3); -MACE_BM_CROP_CPU_MACRO(3, 512, 3); -MACE_BM_CROP_CPU_MACRO(2, 512, 6); - -namespace { -template -void OpenCLCropHelper(int iters, - const std::vector &shape0, - const std::vector &shape1, - int crop_axis, - int offset) { - mace::testing::StopTiming(); - - OpsTestNet net; - // Add input data - net.AddRandomInput("Input0", shape0); - net.AddRandomInput("Input1", shape1); + if (D == DeviceType::CPU) { + auto input_shape0 = TransposeShape(shape0, {0, 3, 1, 2}); + auto input_shape1 = TransposeShape(shape1, {0, 3, 1, 2}); + net.AddRandomInput("Input0", input_shape0); + net.AddRandomInput("Input1", input_shape1); + } else if (D == DeviceType::GPU) { + // Add input data + net.AddRandomInput("Input0", shape0); + net.AddRandomInput("Input1", shape1); + } else { + MACE_NOT_IMPLEMENTED; + } OpDefBuilder("Crop", "CropBM") .Input("Input0") .Input("Input1") - .AddIntArg("axis", crop_axis) - .AddIntsArg("offset", {offset}) + .AddIntsArg("offset", offsets) + .AddIntArg("has_data_format", 1) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Warm-up - for (int i = 0; i < 5; ++i) { - net.RunOp(DeviceType::GPU); + net.Setup(D); + for (int i = 0; i < 1; ++i) { + net.Run(); } const int64_t tot = static_cast(iters) * - (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size()); + (net.GetTensor("Input0")->size()); testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { - net.RunOp(DeviceType::GPU); + net.Run(); } } } // namespace -#define MACE_BM_CROP_GPU_MACRO(N, H, W, C, AXIS, OFFSET, TYPE) \ - static void MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET##\ - _##TYPE(int iters) { \ - std::vector shape0 = {N, H, W, C}; \ - std::vector shape1 = {N / 2, H / 2, W / 2, C / 2}; \ - OpenCLCropHelper(iters, shape0, shape1, AXIS, OFFSET); \ - } \ - 
MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\ - ##_##TYPE) - -MACE_BM_CROP_GPU_MACRO(4, 32, 32, 32, 2, 4, float); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 64, 1, 0, float); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 128, 0, 0, float); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 256, 2, 4, float); +#define MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, DEVICE, TYPE) \ + static void MACE_BM_CROP_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET## \ + _##DEVICE##_##TYPE(int iters) { \ + std::vector shape0 = {N, H, W, C}; \ + std::vector shape1 = {N / 2, H / 2, W / 2, C / 2}; \ + CropHelper(iters, shape0, shape1, AXIS, OFFSET); \ + } \ + MACE_BENCHMARK(MACE_BM_CROP_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\ + ##_##DEVICE##_##TYPE) + +#define MACE_BM_CROP(N, H, W, C, AXIS, OFFSET) \ + MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, CPU, float); \ + MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, GPU, float); \ + MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, GPU, half); + +MACE_BM_CROP(4, 32, 32, 32, 2, 4); +MACE_BM_CROP(8, 32, 32, 64, 1, 0); +MACE_BM_CROP(8, 32, 32, 128, 0, 0); +MACE_BM_CROP(8, 32, 32, 256, 2, 4); -MACE_BM_CROP_GPU_MACRO(4, 32, 32, 32, 2, 4, half); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 64, 1, 0, half); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 128, 0, 0, half); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 256, 2, 4, half); } // namespace test } // namespace ops diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index 872d3154491e22a63ed6e98621a63476ea70ebb5..213b8ce89a58b5745c4e5685c6a825442b5826ce 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -26,7 +26,6 @@ void RunCrop(const std::vector &input_shape, const std::vector &input_data, const std::vector &input_shape2, const std::vector &offset, - const int axis, const std::vector &expected_shape, const std::vector &expected_data) { OpsTestNet net; @@ -39,7 +38,7 @@ void RunCrop(const std::vector &input_shape, .Input("Input1") .Output("Output") .AddIntsArg("offset", offset) - .AddIntArg("axis", axis) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); } else if (D == CPU) { net.TransformDataFormat("Input0", @@ -55,7 +54,7 @@ void RunCrop(const std::vector &input_shape, .Input("InputNCHW1") .Output("OutputNCHW") .AddIntsArg("offset", offset) - .AddIntArg("axis", axis) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); } @@ -113,7 +112,7 @@ TEST_F(CropTest, SimpleCPU) { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, - 4.0, 4.0, 4.0}, {1, 5, 5, 3}, {2, 2}, 2, + 4.0, 4.0, 4.0}, {1, 5, 5, 3}, {-1, 2, 2, -1}, {1, 5, 5, 3}, {1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, @@ -168,7 +167,7 @@ TEST_F(CropTest, SimpleGPU) { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, - 4.0, 4.0, 4.0}, {1, 5, 5, 3}, {2, 2}, 2, + 4.0, 4.0, 4.0}, {1, 5, 5, 3}, {-1, 2, 2, -1}, {1, 5, 5, 3}, {1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, diff --git a/mace/ops/cumsum.cc b/mace/ops/cumsum.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0117270c80ce25bda50ab8e8461302b521c484e --- /dev/null +++ b/mace/ops/cumsum.cc @@ -0,0 +1,152 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <numeric>
+
+#include "mace/core/operator.h"
+
+namespace mace {
+namespace ops {
+
+template <DeviceType D, class T>
+class CumsumOp;
+
+template <class T>
+class CumsumOp<DeviceType::CPU, T> : public Operation {
+ public:
+  explicit CumsumOp(OpConstructContext *context)
+      : Operation(context),
+        axis_(Operation::GetOptionalArg<int>("axis", 0)),
+        exclusive_(Operation::GetOptionalArg<bool>("exclusive", false)),
+        reverse_(Operation::GetOptionalArg<bool>("reverse", false)),
+        checked_(false) {}
+
+  void Validate() {
+    const int32_t input_dims = this->Input(0)->dim_size();
+    axis_ =
+        axis_ < 0 ? axis_ + input_dims : axis_;
+    MACE_CHECK((0 <= axis_ && axis_ < input_dims),
+               "Expected cumsum axis in the range [", -input_dims, ", ",
+               input_dims, "], but got ", axis_);
+  }
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    if (!checked_) {
+      Validate();
+      bool has_data_format = Operation::GetOptionalArg<int>(
+          "has_data_format", 0);
+      if (has_data_format && this->Input(0)->dim_size() == 4) {
+        if (axis_ == 3) axis_ = 1;
+        else if (axis_ == 2) axis_ = 3;
+        else if (axis_ == 1) axis_ = 2;
+      }
+      checked_ = true;
+    }
+
+    const Tensor *input = this->Input(0);
+    const std::vector<index_t> input_shape = input->shape();
+
+    Tensor *output = this->Output(0);
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard output_mapper(output);
+
+    const float *input_ptr = input->data<float>();
+    float *output_ptr = output->mutable_data<float>();
+
+    const index_t outer_size = std::accumulate(input_shape.begin(),
+                                               input_shape.begin() + axis_,
+                                               1,
+                                               std::multiplies<index_t>());
+    const index_t inner_size = std::accumulate(input_shape.begin() + axis_ + 1,
+                                               input_shape.end(),
+                                               1,
+                                               std::multiplies<index_t>());
+    const index_t cum_size = input_shape[axis_];
+
+    if (!reverse_) {
+#pragma omp parallel for
+      for (index_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
+        index_t start_idx = outer_idx * cum_size * inner_size;
+        for (index_t cum_idx = 0; cum_idx < cum_size; ++cum_idx) {
+          if (cum_idx == 0) {
+            if (exclusive_) {
+              std::memset(output_ptr + start_idx,
+                          0,
+                          sizeof(T) * inner_size);
+            } else {
+              std::memcpy(output_ptr + start_idx,
+                          input_ptr + start_idx,
+                          sizeof(T) * inner_size);
+            }
+          } else {
+            index_t cur_idx = start_idx + cum_idx * inner_size;
+            index_t pre_idx = start_idx + (cum_idx - 1) * inner_size;
+            index_t input_idx = exclusive_ ?
pre_idx : cur_idx; + for (index_t inner_idx = 0; inner_idx < inner_size; ++inner_idx) { + output_ptr[cur_idx + inner_idx] = + output_ptr[pre_idx + inner_idx] + + input_ptr[input_idx + inner_idx]; + } + } + } + } + } else { +#pragma omp parallel for + for (index_t outer_idx = outer_size - 1; outer_idx >= 0; --outer_idx) { + index_t start_idx = outer_idx * cum_size * inner_size; + for (index_t cum_idx = cum_size - 1; cum_idx >= 0; --cum_idx) { + index_t cur_idx = start_idx + cum_idx * inner_size; + if (cum_idx == cum_size - 1) { + if (exclusive_) { + std::memset(output_ptr + cur_idx, + 0, + sizeof(T) * inner_size); + } else { + std::memcpy(output_ptr + cur_idx, + input_ptr + cur_idx, + sizeof(T) * inner_size); + } + } else { + index_t pre_idx = start_idx + (cum_idx + 1) * inner_size; + index_t input_idx = exclusive_ ? pre_idx : cur_idx; + for (index_t inner_idx = 0; inner_idx < inner_size; ++inner_idx) { + output_ptr[cur_idx + inner_idx] = + output_ptr[pre_idx + inner_idx] + + input_ptr[input_idx + inner_idx]; + } + } + } + } + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + int32_t axis_; + bool exclusive_; + bool reverse_; + bool checked_; +}; + +void RegisterCumsum(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/cumsum_benchmark.cc b/mace/ops/cumsum_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ca59fa0501fe92a35fbe0a02141cdd23a7c1198 --- /dev/null +++ b/mace/ops/cumsum_benchmark.cc @@ -0,0 +1,90 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
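
For reference, the ``exclusive`` and ``reverse`` flags implemented above compose as follows. A self-contained 1-D sketch (an illustration, not code from this patch) showing all four flag combinations for the input {1, 2, 3}:

.. code:: cpp

    #include <cassert>
    #include <vector>

    // Reference 1-D cumulative sum mirroring the exclusive/reverse flags above.
    std::vector<float> Cumsum1D(const std::vector<float> &x,
                                bool exclusive, bool reverse) {
      const int n = static_cast<int>(x.size());
      std::vector<float> y(n, 0.f);
      float acc = 0.f;
      for (int k = 0; k < n; ++k) {
        const int i = reverse ? n - 1 - k : k;  // walk back to front if reversed
        y[i] = exclusive ? acc : acc + x[i];    // exclusive drops the own element
        acc += x[i];
      }
      return y;
    }

    int main() {
      const std::vector<float> x{1, 2, 3};
      assert((Cumsum1D(x, false, false) == std::vector<float>{1, 3, 6}));
      assert((Cumsum1D(x, true, false) == std::vector<float>{0, 1, 3}));
      assert((Cumsum1D(x, false, true) == std::vector<float>{6, 5, 3}));
      assert((Cumsum1D(x, true, true) == std::vector<float>{5, 3, 0}));
      return 0;
    }
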
+ +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class CumsumOpTest : public OpsTestBase {}; + +namespace { +template +void Cumsum(int iters, int batch, int channels, int height, int width) { + mace::testing::StopTiming(); + + // Construct graph + OpsTestNet net; + + // Add input data + if (D == DeviceType::CPU) { + net.AddRandomInput("Input", {batch, channels, height, width}); + } else { + MACE_NOT_IMPLEMENTED; + } + + OpDefBuilder("Cumsum", "CumsumTest") + .Input("Input") + .Output("Output") + .AddIntArg("axis", 0) + .AddIntArg("exclusive", 0) + .AddIntArg("reverse", 0) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} +} // namespace + +#define MACE_BM_CUMSUM_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_CUMSUM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + Cumsum(iters, N, C, H, W); \ + } \ + MACE_BENCHMARK(MACE_BM_CUMSUM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define MACE_BM_CUMSUM(N, C, H, W) \ + MACE_BM_CUMSUM_MACRO(N, C, H, W, float, CPU); + +MACE_BM_CUMSUM(1, 1, 512, 512); +MACE_BM_CUMSUM(1, 3, 128, 128); +MACE_BM_CUMSUM(1, 3, 512, 512); +MACE_BM_CUMSUM(1, 32, 112, 112); +MACE_BM_CUMSUM(1, 64, 256, 256); +MACE_BM_CUMSUM(1, 64, 512, 512); +MACE_BM_CUMSUM(1, 128, 56, 56); +MACE_BM_CUMSUM(1, 128, 256, 256); +MACE_BM_CUMSUM(1, 256, 14, 14); +MACE_BM_CUMSUM(1, 512, 14, 14); +MACE_BM_CUMSUM(1, 1024, 7, 7); +MACE_BM_CUMSUM(32, 1, 256, 256); +MACE_BM_CUMSUM(32, 3, 256, 256); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/cumsum_test.cc b/mace/ops/cumsum_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8b111540c9040a391ae419d86e3c042b23954b5e --- /dev/null +++ b/mace/ops/cumsum_test.cc @@ -0,0 +1,91 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
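
The tests that follow pass ``axis``, ``exclusive``, and ``reverse`` against NCHW-transformed inputs, relying on the axis remap the CPU op performs when ``has_data_format`` is set on a 4-D input (NHWC axis 3 maps to NCHW axis 1, axis 1 to 2, and axis 2 to 3). A standalone restatement of that mapping, for reference only:

.. code:: cpp

    // NHWC -> NCHW axis remap used by the CPU Cumsum op when
    // has_data_format is set on a 4-D input.
    // Dim order: NHWC = {N, H, W, C}, NCHW = {N, C, H, W}.
    inline int RemapAxisNhwcToNchw(int axis) {
      switch (axis) {
        case 0: return 0;  // N stays first
        case 1: return 2;  // H moves from index 1 to index 2
        case 2: return 3;  // W moves from index 2 to index 3
        case 3: return 1;  // C moves from index 3 to index 1
        default: return axis;
      }
    }

So a user-visible axis of 3 (channels in NHWC) becomes axis 1 once the data has been transformed to NCHW, which is what ``SimpleTestWithDataFormat`` below exercises.
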
+ +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class CumsumOpTest : public OpsTestBase {}; + +namespace { +template +void SimpleTestWithDataFormat(const std::vector &shape, + const std::vector &input, + const int axis, + const int exclusive, + const int reverse, + const std::vector &output) { + // Construct graph + OpsTestNet net; + + net.AddInputFromArray("Input", shape, input); + net.TransformDataFormat("Input", NHWC, "InputNCHW", + NCHW); + + OpDefBuilder("Cumsum", "CumsumTest") + .Input("InputNCHW") + .Output("OutputNCHW") + .AddIntArg("axis", axis) + .AddIntArg("exclusive", exclusive) + .AddIntArg("reverse", reverse) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("has_data_format", 1) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(DeviceType::CPU); + + net.TransformDataFormat("OutputNCHW", NCHW, "Output", + NHWC); + + net.AddInputFromArray("ExpectedOutput", shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(CumsumOpTest, HasDataFormatCPU) { + SimpleTestWithDataFormat( + {2, 2, 2, 2}, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}, + 0, 0, 0, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 10., 12., 14., 16., 18., 20., 22.}); + SimpleTestWithDataFormat( + {2, 2, 2, 2}, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}, + 1, 0, 0, + {0., 1., 2., 3., 4., 6., 8., 10., 8., 9., 10., 11., 20., 22., 24., 26.}); + SimpleTestWithDataFormat( + {2, 2, 2, 2}, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}, + 0, 1, 0, + {0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 3., 4., 5., 6., 7.}); + SimpleTestWithDataFormat( + {2, 2, 2, 2}, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}, + 0, 0, 1, + {8., 10., 12., 14., 16., 18., 20., 22., 8., 9., 10., 11., 12., 13., 14., + 15.}); + SimpleTestWithDataFormat( + {2, 2, 2, 2}, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}, + 1, 1, 1, + {4., 5., 6., 7., 0., 0., 0., 0., 12., 13., 14., 15., 0., 0., 0., 0.}); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 22fa5c5bb6f95c637e4d9b96652293302697c769..6e9a0fa8db36209887f86d0fdc75d5c5d1a5c2bc 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -28,7 +28,8 @@ #include "mace/core/tensor.h" #include "mace/ops/activation.h" #include "mace/ops/arm/deconv_2d_neon.h" -#include "mace/utils/utils.h" +#include "mace/utils/memory.h" +#include "mace/utils/math.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/deconv_2d.h" @@ -362,7 +363,7 @@ class Deconv2dOp : public Deconv2dOpBase { : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::Deconv2dKernel); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index ed9cdb539445b17810eaa685135ad12fbfc1a3ba..2460d75a258068c4e0f08576311bf93ace6b3289 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -19,6 +19,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/depth_to_space.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -97,7 +98,7 @@ class DepthToSpaceOp : public Operation { : 
Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::DepthToSpaceKernel(block_size)); + kernel_ = make_unique>(block_size); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index c61f13049d51a6ce6c3fe624c345052316f4a6d3..22b13c268de07a10ffc4dfc06fdad69c953a37f5 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -33,6 +33,7 @@ #include "mace/ops/arm/depthwise_conv2d_neon.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/public/mace.h" +#include "mace/utils/memory.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" @@ -493,19 +494,25 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { MemoryType mem_type; if (context->device()->gpu_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; - kernel_.reset(new opencl::image::DepthwiseConv2dKernel); + kernel_ = make_unique>(); } else { mem_type = MemoryType::GPU_BUFFER; - kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); + kernel_ = make_unique>(); } context->set_output_mem_type(mem_type); - // Transform filter tensor to target format - MACE_CHECK(TransformFilter( - context, - operator_def_.get(), - 1, - OpenCLBufferType::DW_CONV2D_FILTER, - mem_type) == MaceStatus::MACE_SUCCESS); + Tensor *filter_tensor = context->workspace()->GetTensor( + operator_def_->input(1)); + if (filter_tensor != nullptr && filter_tensor->is_weight()) { + // Transform filter tensor to target format + MACE_CHECK(TransformFilter( + context, + operator_def_.get(), + 1, + OpenCLBufferType::DW_CONV2D_FILTER, + mem_type) == MaceStatus::MACE_SUCCESS); + } else { + context->SetInputOpenCLBufferType(1, OpenCLBufferType::DW_CONV2D_FILTER); + } if (operator_def_->input_size() > 2) { MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 06c55ab27a2f831bb681bb3ef2c39d96b44922b1..3d203cfa5678c1ca407b6db2d441890bc00785a5 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -26,8 +26,9 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/ops/arm/depthwise_deconv2d_neon.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" #include "mace/public/mace.h" +#include "mace/utils/memory.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/depthwise_deconv2d.h" @@ -36,7 +37,7 @@ namespace mace { namespace ops { -template +template class DepthwiseDeconv2dOp; template<> @@ -91,10 +92,11 @@ class DepthwiseDeconv2dOp const index_t pad_top = out_paddings[1] / 2; index_t padded_out_size = - std::accumulate(padded_out_shape.begin(), - padded_out_shape.end(), - 1, - std::multiplies()) * sizeof(float); + PadAlignSize(std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) + * sizeof(float) + MACE_EXTRA_BUFFER_PAD_SIZE); ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); scratch->GrowSize(padded_out_size); @@ -252,7 +254,6 @@ class DepthwiseDeconv2dOp padded_out_shape.data(), out_data); - if (!no_pad) { CropPadOut(out_data, padded_out_shape.data(), @@ -383,7 +384,7 @@ class DepthwiseDeconv2dOp const index_t out_offset = i * strides[0] * out_width + j * strides[1]; for (int q = 0; q < in_channels_g; 
++q) { - const index_t in_base = + const index_t in_base = ((b * group + g) * in_channels_g + q) * in_img_size; const index_t in_offset = in_base + i * in_width + j; @@ -412,7 +413,7 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index f035eeee579907fea2ddb77d04ca5c982c903b67..92864ae1016fad410ce054887babd09ee2557c59 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef MACE_ENABLE_NEON +#ifdef MACE_ENABLE_QUANTIZE +#include "mace/ops/arm/q8/eltwise.h" +#endif // MACE_ENABLE_QUANTIZE +#endif // MACE_ENABLE_NEON + #include "mace/ops/eltwise.h" #include @@ -24,6 +30,7 @@ #include "mace/core/future.h" #include "mace/core/operator.h" #include "mace/core/tensor.h" +#include "mace/utils/memory.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" @@ -890,8 +897,8 @@ class EltwiseOp : public Operation { scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), scalar_input_index_(Operation::GetOptionalArg( "scalar_input_index", 1)), - data_format_(static_cast(Operation::GetOptionalArg( - "data_format", 0))) {} + has_data_format_(Operation::GetOptionalArg( + "has_data_format", 0)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -920,7 +927,9 @@ class EltwiseOp : public Operation { const Tensor *input1, Tensor *output) { bool swapped = false; - if (input0->size() < input1->size()) { + if (input0->dim_size() < input1->dim_size() + || (input0->dim_size() == input1->dim_size() + && input0->size() < input1->size())) { std::swap(input0, input1); swapped = true; } @@ -931,7 +940,7 @@ class EltwiseOp : public Operation { // check if we can broadcast tensor uint32_t rank_diff = static_cast(input0->dim_size() - input1->dim_size()); - if (data_format_ == NCHW) { + if (has_data_format_) { MACE_CHECK( (input0->dim_size() == 4) && ((input1->dim_size() == 0) || @@ -956,7 +965,7 @@ class EltwiseOp : public Operation { const T *input0_ptr = input0->data(); const T *input1_ptr = input1->data(); - if (data_format_ == NCHW && input1->dim_size() > 0) { + if (has_data_format_ && input1->dim_size() > 0) { MACE_RETURN_IF_ERROR(output->ResizeLike(input0)); Tensor::MappingGuard output_guard(output); DstType *output_ptr = output->mutable_data(); @@ -1018,7 +1027,7 @@ class EltwiseOp : public Operation { std::vector coeff_; float scalar_input_; int32_t scalar_input_index_; - DataFormat data_format_; + int has_data_format_; Tensor scalar_tensor_; }; @@ -1033,21 +1042,30 @@ class EltwiseOp : public Operation { coeff_(Operation::GetRepeatedArgs("coeff")), scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), scalar_input_index_(Operation::GetOptionalArg( - "scalar_input_index", 1)), - data_format_(static_cast(Operation::GetOptionalArg( - "data_format", 0))) {} + "scalar_input_index", 1)) +#ifdef MACE_ENABLE_NEON + , eltwise_(static_cast(Operation::GetOptionalArg( + "type", static_cast(ops::EltwiseType::NONE)))) +#endif + {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); const Tensor *input0 = this->Input(0); - const Tensor *input1 = this->InputSize() == 2 ? 
this->Input(1) : nullptr;
+    MACE_CHECK(this->InputSize() == 2,
+               "Quantized Elementwise doesn't support broadcast now.");
+    const Tensor *input1 = this->Input(1);
     Tensor *output = this->Output(0);
-    MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. ");
+    MACE_CHECK(type_ == SUM || type_ == SUB,
+               "Quantized Elementwise only supports SUM and SUB now.");
     MACE_CHECK(input0->size() == input1->size(),
                "input0 and input1 must have the same shape.");
     MACE_CHECK(output->scale() != 0);
     MACE_RETURN_IF_ERROR(output->Resize(input0->shape()));
 
+#ifdef MACE_ENABLE_NEON
+    eltwise_.Compute(context, input0, input1, output);
+#else
     constexpr int left_shift = 20;
     const double doubled_scale = 2 * std::max(input0->scale(),
                                               input1->scale());
     const double adjusted_input0_scale = input0->scale() / doubled_scale;
@@ -1078,57 +1096,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
     auto input0_ptr = input0->data<uint8_t>();
     auto input1_ptr = input1->data<uint8_t>();
     auto output_ptr = output->mutable_data<uint8_t>();
-
-    index_t handled_output_size = 0;
-#ifdef MACE_ENABLE_NEON
-#pragma omp parallel for schedule(runtime)
-    for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) {
-      const auto input0_val = vld1_u8(input0_ptr + i);
-      const auto input1_val = vld1_u8(input1_ptr + i);
-      const auto input0_val_s16 =
-          vreinterpretq_s16_u16(vmovl_u8(input0_val));
-      const auto input1_val_s16 =
-          vreinterpretq_s16_u16(vmovl_u8(input1_val));
-      const auto offset_input0 =
-          vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
-      const auto offset_input1 =
-          vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
-      auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
-      auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
-      auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
-      auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
-      const auto left_shift_dup = vdupq_n_s32(left_shift);
-      input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
-      input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
-      input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
-      input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
-      input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
-      input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
-      input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
-      input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
-      const auto input0_shift_dup = vdupq_n_s32(input0_shift);
-      const auto input1_shift_dup = vdupq_n_s32(input1_shift);
-      input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
-      input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
-      input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
-      input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
-      auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32);
-      auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32);
-      sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier);
-      sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier);
-      sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift);
-      sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift);
-      const auto sum_low_s16 = vmovn_s32(sum_low);
-      const auto sum_high_s16 = vmovn_s32(sum_high);
-      const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16,
-                                                     sum_high_s16),
-                                        vdupq_n_s16(output->zero_point()));
-      vst1_u8(output_ptr + i, vqmovun_s16(output_val));
-    }
-    handled_output_size = output->size() - output->size() % 8;
-#endif  //
NEON #pragma omp parallel for schedule(runtime) - for (index_t i = handled_output_size; i < output->size(); ++i) { + for (index_t i = 0; i < output->size(); ++i) { const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); const int32_t shifted_input0 = offset_input0 * (1 << left_shift); @@ -1143,14 +1112,22 @@ class EltwiseOp : public Operation { gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, input1_multiplier), -input1_shift); - const int32_t sum = multiplied_input0 + multiplied_input1; + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = multiplied_input0 - multiplied_input1; + } + const int32_t output_val = gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(sum, + gemmlowp::SaturatingRoundingDoublingHighMul(res, output_multiplier), -output_shift) + output->zero_point(); output_ptr[i] = Saturate(output_val); } +#endif // NEON return MaceStatus::MACE_SUCCESS; } @@ -1160,8 +1137,10 @@ class EltwiseOp : public Operation { std::vector coeff_; float scalar_input_; int32_t scalar_input_index_; - DataFormat data_format_; Tensor scalar_tensor_; +#ifdef MACE_ENABLE_NEON + arm::q8::Eltwise eltwise_; +#endif }; #endif // MACE_ENABLE_QUANTIZE @@ -1181,8 +1160,8 @@ class EltwiseOp : public Operation { MemoryType mem_type; if (context->device()->gpu_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; - kernel_.reset(new opencl::image::EltwiseKernel( - type, coeff, scalar_input, scalar_input_index)); + kernel_ = make_unique>( + type, coeff, scalar_input, scalar_input_index); } else { MACE_NOT_IMPLEMENTED; } @@ -1192,12 +1171,23 @@ class EltwiseOp : public Operation { for (int i = 0; i < input_size; ++i) { if (ws->HasTensor(operator_def_->input(i)) && ws->GetTensor(operator_def_->input(i))->is_weight()) { - MACE_CHECK(TransformFilter( - context, - operator_def_.get(), - i, - OpenCLBufferType::ARGUMENT, - mem_type) == MaceStatus::MACE_SUCCESS); + if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) { + MACE_CHECK(TransformFilter( + context, + operator_def_.get(), + i, + OpenCLBufferType::ARGUMENT, + mem_type) == MaceStatus::MACE_SUCCESS); + } else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) { + MACE_CHECK(TransformFilter( + context, + operator_def_.get(), + i, + OpenCLBufferType::IN_OUT_CHANNEL, + mem_type) == MaceStatus::MACE_SUCCESS); + } else { + MACE_NOT_IMPLEMENTED; + } } } } diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h index c79c6c27abfb3cef4ed02abfacc3dea5384e1bd3..208d7f26549b6642502dcf6022983ad4f0f52622 100644 --- a/mace/ops/eltwise.h +++ b/mace/ops/eltwise.h @@ -15,25 +15,11 @@ #ifndef MACE_OPS_ELTWISE_H_ #define MACE_OPS_ELTWISE_H_ +#include "mace/ops/common/eltwise_type.h" + namespace mace { namespace ops { -enum EltwiseType { - SUM = 0, - SUB = 1, - PROD = 2, - DIV = 3, - MIN = 4, - MAX = 5, - NEG = 6, - ABS = 7, - SQR_DIFF = 8, - POW = 9, - EQUAL = 10, - FLOOR_DIV = 11, - NONE = 12, -}; - inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; } } // namespace ops diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 0bfb666f70d3fd606703e32bcd3a4baf3f788fa6..a1959e9df5c388dd6a3605538e83558f3d4e563d 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -30,12 +30,12 @@ void EltwiseBenchmark( OpsTestNet net; // Add input data - if (D == DeviceType::GPU) { - net.AddRandomInput("Input0", {n, h, w, c}); - 
net.AddRandomInput("Input1", {n, h, w, c}); - } else { + if (D == DeviceType::CPU && DataTypeToEnum::value != DT_UINT8) { net.AddRandomInput("Input0", {n, c, h, w}); net.AddRandomInput("Input1", {n, c, h, w}); + } else { + net.AddRandomInput("Input0", {n, h, w, c}); + net.AddRandomInput("Input1", {n, h, w, c}); } OpDefBuilder("Eltwise", "EltwiseTest") @@ -44,18 +44,25 @@ void EltwiseBenchmark( .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", {1.2, 2.1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); + net.Setup(D); + + if (D == DeviceType::CPU && DataTypeToEnum::value == DT_UINT8) { + net.GetTensor("Output")->SetScale(0.1); + } + // Warm-up for (int i = 0; i < 5; ++i) { - net.RunOp(D); + net.Run(); net.Sync(); } mace::testing::StartTiming(); while (iters--) { - net.RunOp(D); + net.Run(); net.Sync(); } } @@ -86,6 +93,9 @@ MACE_BM_ELTWISE(0, 1, 240, 240, 256); MACE_BM_ELTWISE(5, 1, 128, 128, 32); MACE_BM_ELTWISE(5, 1, 240, 240, 256); +MACE_BM_ELTWISE_MACRO(0, 1, 128, 128, 32, uint8_t, CPU); +MACE_BM_ELTWISE_MACRO(1, 1, 128, 128, 32, uint8_t, CPU); + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 7ca799e2e8701b8adb439218c17ce10d8fbd0f56..58306b625a5ce8e38b0b129c230a4401d3a06ae9 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -75,7 +75,7 @@ void SimpleTensorScalar(const ops::EltwiseType type, .AddIntArg("T", DataTypeToEnum::v()) .AddIntArg("type", static_cast(type)) .AddFloatArg("scalar_input", x) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT}) .Output("TOutput") .Finalize(net.NewOperatorDef()); @@ -120,7 +120,7 @@ void SimpleTensorEltwise(const ops::EltwiseType type, .AddIntArg("T", DataTypeToEnum::v()) .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .OutputType({ops::IsLogicalType(type) ? 
DT_INT32 : DT_FLOAT}) .Output("TOutput"); if (shape0.size() > 1) { @@ -642,7 +642,7 @@ void RandomTensorScalar(const ops::EltwiseType type, .Input("TInput") .AddIntArg("type", static_cast(type)) .AddFloatArg("scalar_input", 0.1) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .Output("TOutput") .Finalize(net.NewOperatorDef()); // Run @@ -699,7 +699,7 @@ void RandomTensorEltwise(const ops::EltwiseType type, .Input("TInput1") .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .Output("TOutput") .Finalize(net.NewOperatorDef()); @@ -729,7 +729,8 @@ void RandomTensorEltwise(const ops::EltwiseType type, } } -void QuantizedSum(const std::vector &shape) { +void Quantized(const std::vector &shape, + const ops::EltwiseType type) { // Construct graph OpsTestNet net; @@ -753,8 +754,8 @@ void QuantizedSum(const std::vector &shape) { OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput0") .Input("TInput1") - .AddIntArg("type", static_cast(ops::EltwiseType::SUM)) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("type", static_cast(type)) + .AddIntArg("has_data_format", 1) .Output("TOutput") .Finalize(net.NewOperatorDef()); @@ -794,7 +795,7 @@ void QuantizedSum(const std::vector &shape) { .Input("QuantizedInput0") .Input("QuantizedInput1") .Output("QuantizedOutput") - .AddIntArg("type", static_cast(ops::EltwiseType::SUM)) + .AddIntArg("type", static_cast(type)) .AddIntArg("T", static_cast(DT_UINT8)) .Finalize(net.NewOperatorDef()); net.Setup(DeviceType::CPU); @@ -1009,9 +1010,11 @@ TEST_F(EltwiseOpTest, TensorGeneralBroadcastGPU) { {1, 1, 2, 1}, {2, 3}, {1, 1, 2, 5}, {4, 1, 0, 1, 4, 4, 9, 16, 25, 36}); } -TEST_F(EltwiseOpTest, QuantizedSum) { - QuantizedSum({1, 32, 32, 16}); - QuantizedSum({1, 31, 31, 17}); +TEST_F(EltwiseOpTest, Quantized) { + Quantized({1, 32, 32, 16}, ops::EltwiseType::SUM); + Quantized({1, 31, 31, 17}, ops::EltwiseType::SUM); + Quantized({1, 32, 32, 16}, ops::EltwiseType::SUB); + Quantized({1, 31, 31, 17}, ops::EltwiseType::SUB); } } // namespace test diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index c82aa8ff5332c850b70100b97b0c6c1cfb3c33d3..22d45ea7c5de05eff05f2ad1fa30c9bcd92f6b7d 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -38,6 +38,8 @@ #include "mace/ops/opencl/image/fully_connected.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" + namespace mace { namespace ops { @@ -186,7 +188,7 @@ class FullyConnectedOp : public FullyConnectedOpBase { MemoryType mem_type; if (context->device()->gpu_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; - kernel_.reset(new opencl::image::FullyConnectedKernel); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/gather_test.cc b/mace/ops/gather_test.cc index 32f849c7abf69318fe2fdcb9dcacb97bc437aec0..2c0f474ca7aa9437328f0319ba6538c11f538d3d 100644 --- a/mace/ops/gather_test.cc +++ b/mace/ops/gather_test.cc @@ -23,53 +23,67 @@ namespace test { class GatherOpTest : public OpsTestBase {}; namespace { +template void TestGather(const std::vector &weight_shape, - const std::vector &weight, + const std::vector &weight, const std::vector &input_shape, const std::vector &input, const int axis, const std::vector &output_shape, - const std::vector &output) { + const std::vector &output) { OpsTestNet net; - net.AddInputFromArray("Params", weight_shape, weight); + net.AddInputFromArray("Params", 
weight_shape, weight); net.AddInputFromArray("Indices", input_shape, input); OpDefBuilder("Gather", "GatherTest") .Input("Params") .Input("Indices") + .AddIntArg("T", DataTypeToEnum::v()) .AddIntArg("axis", axis) .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(CPU); - auto expected = net.CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace TEST_F(GatherOpTest, CPUScalarIndex) { - TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + {}, {5}, 0, {2}, {10, 11}); + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {}, {5}, 0, {2}, {10, 11}); } TEST_F(GatherOpTest, CPURank1Index) { - TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + {3}, {2, 4, 6}, 0, {3, 2}, {4, 5, 8, 9, 12, 13}); + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {3}, {2, 4, 6}, 0, {3, 2}, {4, 5, 8, 9, 12, 13}); } TEST_F(GatherOpTest, CPURank1IndexWithAxis1) { - TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + {1}, {1}, 1, {10, 1}, {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}); + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {1}, {1}, 1, {10, 1}, {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}); } TEST_F(GatherOpTest, CPURankHighIndex) { - TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + {1, 3}, {2, 4, 6}, 0, {1, 3, 2}, {4, 5, 8, 9, 12, 13}); + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {1, 3}, {2, 4, 6}, 0, {1, 3, 2}, {4, 5, 8, 9, 12, 13}); } diff --git a/mace/ops/infer_conv2d_shape.cc b/mace/ops/infer_conv2d_shape.cc index cd0d96b8cef49abad2e97cd60a81619065d51ebb..38f711f57ad824f146a4cd0abf306300b5122735 100644 --- a/mace/ops/infer_conv2d_shape.cc +++ b/mace/ops/infer_conv2d_shape.cc @@ -34,9 +34,10 @@ class InferConv2dShapeOp : public Operation { Tensor::MappingGuard output_guard(output); int32_t *output_data = output->mutable_data(); - const int32_t data_format = - Operation::GetOptionalArg("data_format", 0); - const bool isNCHW = data_format == 1; + auto has_data_format = + Operation::GetOptionalArg("has_data_format", 0); + const bool isNCHW = (has_data_format && + input->data_format() == DataFormat::NCHW); Padding padding_type = static_cast(Operation::GetOptionalArg( diff --git a/mace/ops/infer_conv2d_shape_test.cc b/mace/ops/infer_conv2d_shape_test.cc index feaaecff8364d9f1a3270105bc03ddb36e3f5be2..333baaf944b34d8e2e0d78cd4e3d84aefa950163 100644 --- a/mace/ops/infer_conv2d_shape_test.cc +++ b/mace/ops/infer_conv2d_shape_test.cc @@ -57,8 +57,8 @@ void TestInferConv2dShapeOp(const std::vector &input_shape, } // namespace TEST_F(InferConv2dShapeOpTest, TestInferConv2dShape) { -TestInferConv2dShapeOp({3, 640, 480, 16}, 1, {3, 640, 480, 3}); -TestInferConv2dShapeOp({3, 640, 480, 16}, 2, {3, 320, 240, 3}); + TestInferConv2dShapeOp({3, 640, 480, 16}, 1, {3, 640, 480, 3}); + TestInferConv2dShapeOp({3, 640, 480, 16}, 2, {3, 320, 240, 3}); } } // namespace test diff --git 
a/mace/ops/lstm_cell.cc b/mace/ops/lstm_cell.cc index a342cef812847070f63ed048045159185d28f0a5..82ed9053b6d05a40c2e31e6854c0ec16c62f7ae8 100644 --- a/mace/ops/lstm_cell.cc +++ b/mace/ops/lstm_cell.cc @@ -18,6 +18,7 @@ #include "mace/core/operator.h" #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/lstm_cell.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -25,6 +26,7 @@ namespace ops { template class LSTMCellOp; +#ifdef MACE_ENABLE_OPENCL template class LSTMCellOp : public Operation { public: @@ -35,7 +37,7 @@ class LSTMCellOp : public Operation { 0.0)); MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::LSTMCellKernel(forget_bias)); + kernel_ = make_unique>(forget_bias); } else { MACE_NOT_IMPLEMENTED; } @@ -88,6 +90,7 @@ class LSTMCellOp : public Operation { MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL); MACE_OP_OUTPUT_TAGS(CELL, OUTPUT); }; +#endif void RegisterLSTMCell(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index a3aebcb49abe323a24bc792f857577481be19f35..3b0913de574607660b807ea133f3e797a30aca71 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -21,8 +21,7 @@ #include "mace/core/operator.h" #include "mace/core/tensor.h" -#include "mace/ops/sgemm.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" #ifdef MACE_ENABLE_NEON #include "mace/ops/arm/fp32/gemm.h" @@ -38,7 +37,7 @@ #endif // MACE_ENABLE_NEON #ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL @@ -233,8 +232,8 @@ class MatMulFixpointImpl { const index_t height, const index_t K, const index_t width, - const bool lhs_bached, - const bool rhs_bached, + const bool lhs_batched, + const bool rhs_batched, Tensor *C) { #if defined(MACE_ENABLE_NEON) if (width == 1 && AOrder == gemmlowp::MapOrder::RowMajor) { @@ -245,8 +244,8 @@ class MatMulFixpointImpl { batch, height, K, - true, - true, + lhs_batched, + rhs_batched, C); } else if (height == 1 && BOrder == gemmlowp::MapOrder::ColMajor) { gemv_kernel_.Compute(context, @@ -256,8 +255,8 @@ class MatMulFixpointImpl { batch, width, K, - true, - true, + lhs_batched, + rhs_batched, C); } else { #endif // MACE_ENABLE_NEON @@ -281,11 +280,13 @@ class MatMulFixpointImpl { for (index_t i = 0; i < batch; ++i) { gemmlowp::MatrixMap - a_matrix(a_ptr_base + static_cast(lhs_bached) * i * a_size, + a_matrix(a_ptr_base + + static_cast(lhs_batched) * i * a_size, height, K); gemmlowp::MatrixMap - b_matrix(b_ptr_base + static_cast(rhs_bached) * i * b_size, + b_matrix(b_ptr_base + + static_cast(rhs_batched) * i * b_size, K, width); gemmlowp::MatrixMap @@ -315,8 +316,8 @@ class MatMulFixpointImpl { const index_t height, const index_t K, const index_t width, - const bool lhs_bached, - const bool rhs_bached, + const bool lhs_batched, + const bool rhs_batched, Tensor *C) { C->SetScale(A->scale() * B->scale()); C->SetZeroPoint(0); @@ -330,8 +331,8 @@ class MatMulFixpointImpl { batch, height, K, - lhs_bached, - rhs_bached, + lhs_batched, + rhs_batched, C); } else if (height == 1 && BOrder == gemmlowp::MapOrder::ColMajor) { gemv_kernel_.Compute(context, @@ -341,8 +342,8 @@ class MatMulFixpointImpl { batch, width, K, - lhs_bached, - rhs_bached, + lhs_batched, + rhs_batched, C); } else { #endif // MACE_ENABLE_NEON @@ -366,12 +367,12 @@ 
class MatMulFixpointImpl { for (index_t i = 0; i < batch; ++i) { gemmlowp::MatrixMap a_matrix - (a_ptr_base + static_cast(lhs_bached) * i * a_size, + (a_ptr_base + static_cast(lhs_batched) * i * a_size, height, K); gemmlowp::MatrixMap b_matrix - (b_ptr_base + static_cast(rhs_bached) * i * b_size, + (b_ptr_base + static_cast(rhs_batched) * i * b_size, K, width); gemmlowp::MatrixMap diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index acacdc7ff91bbda1945415f8ed668a4e0fb63bbd..308113ffcc380d67fd39f89bcb487fce628d77e9 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -21,7 +21,6 @@ #include "public/gemmlowp.h" #include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" -#include "mace/ops/sgemm.h" #include "mace/ops/ops_test_util.h" namespace gemmlowp { @@ -94,32 +93,6 @@ namespace test { namespace { -// Matmul with (m, k) x (k, n) -void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) { - mace::testing::StopTiming(); - std::vector lhs(m * k); - std::vector rhs(k * n); - std::vector result(m * n); - - ops::SGemmMatrixMap - matrix_lhs(1, m, k, SGemmRowMajor, lhs.data(), - true); - ops::SGemmMatrixMap - matrix_rhs(1, k, n, SGemmRowMajor, rhs.data(), - true); - ops::SGemmMatrixMap - matrix_result(1, m, n, SGemmRowMajor, result.data()); - - ops::SGemm sgemm; - - sgemm(matrix_lhs, matrix_rhs, &matrix_result); - - mace::testing::StartTiming(); - while (iters--) { - sgemm(matrix_lhs, matrix_rhs, &matrix_result); - } -} - void MatmulBenchmark_Eigen(int iters, int m, int k, int n) { mace::testing::StopTiming(); Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(m, k); @@ -223,7 +196,6 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) { MACE_BENCHMARK(MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC) #define MACE_BM_MATMUL(M, K, N) \ - MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float); \ MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float); \ MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \ MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_int32, uint8_t); diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc index 741393ffea45d435b52156f38b3a3ddc4d0e5b84..f88ac39435e328ad2a4ada6b3c41a73558fdb791 100644 --- a/mace/ops/matmul_test.cc +++ b/mace/ops/matmul_test.cc @@ -135,7 +135,8 @@ void Complex(const std::vector &batch, rhs_batched, &expected_output_tensor); - ExpectTensorNear(expected_output_tensor, *net.GetTensor("Output")); + ExpectTensorNear(expected_output_tensor, *net.GetTensor("Output"), + 1e-4, 1e-2); } } // namespace diff --git a/mace/ops/opencl/activation.h b/mace/ops/opencl/activation.h index 6eecb6416659f899e4926332a02695597782ee62..6e9b92242b499906fb3304fdcedfc1e739e9abb4 100644 --- a/mace/ops/opencl/activation.h +++ b/mace/ops/opencl/activation.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_ACTIVATION_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/addn.h b/mace/ops/opencl/addn.h index b78a7099646f77f40e5f2c058d90dcd414cb0dec..ba161ba641ed6d1c041e2e41aa547f1c45071e48 100644 --- a/mace/ops/opencl/addn.h +++ b/mace/ops/opencl/addn.h @@ -18,7 +18,7 @@ #include #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/batch_norm.h b/mace/ops/opencl/batch_norm.h index 6f91f95d67d4efe23ae9b8eb57142ea7ba1f3acd..bf49c994e127910632dfd2b2ce9d76a4855a29dc 100644 --- a/mace/ops/opencl/batch_norm.h +++ b/mace/ops/opencl/batch_norm.h @@ 
-16,7 +16,7 @@ #define MACE_OPS_OPENCL_BATCH_NORM_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/batch_to_space.h b/mace/ops/opencl/batch_to_space.h index 4cf8db94399d13ed77391d80f8d7447b8edff59a..9bb62f7052d64ba437d44f5d0ed6403c475c3dde 100644 --- a/mace/ops/opencl/batch_to_space.h +++ b/mace/ops/opencl/batch_to_space.h @@ -19,7 +19,7 @@ #include "mace/core/types.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/bias_add.h b/mace/ops/opencl/bias_add.h index d0b4469dd0154ae7234f984771ffa70509abeb32..80a86423c50922bec581b445206445cf2df83d41 100644 --- a/mace/ops/opencl/bias_add.h +++ b/mace/ops/opencl/bias_add.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_BIAS_ADD_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/buffer/conv_2d.h b/mace/ops/opencl/buffer/conv_2d.h index 736ecb2a420af7941490224b6f0c390abbb3bac9..4ef8d79d9304143d29ba35125ad0b0970af310cb 100644 --- a/mace/ops/opencl/buffer/conv_2d.h +++ b/mace/ops/opencl/buffer/conv_2d.h @@ -22,6 +22,7 @@ #include "mace/ops/opencl/buffer/utils.h" #include "mace/ops/opencl/helper.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -211,8 +212,8 @@ MaceStatus Conv2dKernel::Compute( old_scratch_size_ = scratch->size(); } - padded_input.reset(new Tensor(scratch->Scratch(padded_input_size), - input->dtype())); + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); padded_input->Resize(padded_input_shape); PadInput(context, &kernels_[0], input, pad_top, pad_left, diff --git a/mace/ops/opencl/buffer/depthwise_conv2d.h b/mace/ops/opencl/buffer/depthwise_conv2d.h index 74a3cb945158382fb9b546cdfee6d0091c1892c7..6a46334a787378441d84d020cf578042e6bd24b9 100644 --- a/mace/ops/opencl/buffer/depthwise_conv2d.h +++ b/mace/ops/opencl/buffer/depthwise_conv2d.h @@ -22,6 +22,7 @@ #include "mace/ops/opencl/buffer/utils.h" #include "mace/ops/opencl/helper.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -165,8 +166,8 @@ MaceStatus DepthwiseConv2dKernel::Compute( old_scratch_size_ = scratch->size(); } - padded_input.reset(new Tensor(scratch->Scratch(padded_input_size), - input->dtype())); + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); padded_input->Resize(padded_input_shape); PadInput(context, &kernels_[0], input, pad_top, pad_left, diff --git a/mace/ops/opencl/buffer/pooling.h b/mace/ops/opencl/buffer/pooling.h index ab1e6f85929298483339944d7eb97d0781023a04..4f153e4acfff75ab179e567803e05e14f67ceebf 100644 --- a/mace/ops/opencl/buffer/pooling.h +++ b/mace/ops/opencl/buffer/pooling.h @@ -24,6 +24,7 @@ #include "mace/ops/opencl/buffer/utils.h" #include "mace/ops/opencl/helper.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -124,8 +125,8 @@ MaceStatus PoolingKernel::Compute( old_scratch_size_ = scratch->size(); } - padded_input.reset(new Tensor(scratch->Scratch(padded_input_size), - input->dtype())); + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); padded_input->Resize(padded_input_shape); PadInput(context, &kernels_[0], input, 0, 0, diff --git a/mace/ops/opencl/buffer_transform_kernel.h b/mace/ops/opencl/buffer_transform_kernel.h index 4269b67d22ca157f28fcde4a0f607f9ae6e9a5df..47f1cbaf10f4cf70c0a1d9014ba0ad77261414fe 100644 --- 
a/mace/ops/opencl/buffer_transform_kernel.h +++ b/mace/ops/opencl/buffer_transform_kernel.h @@ -17,7 +17,7 @@ #include "mace/core/runtime/opencl/opencl_util.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { class OpContext; diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index e65ae3701efe51068bb81a39e533f170502c792e..dbb6eab64c22f2941c2710f6a2730a527149f6c3 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -24,6 +24,7 @@ #include "mace/ops/opencl/image/image_to_buffer.h" #include "mace/ops/opencl/buffer/buffer_transform.h" #include "mace/ops/common/transpose.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -34,11 +35,11 @@ class OpenCLBufferTransformer { OpenCLBufferTransformer(const MemoryType in_mem_type, const MemoryType out_mem_type) { if (out_mem_type == MemoryType::GPU_IMAGE) { - kernel_.reset(new opencl::image::BufferToImage); + kernel_ = make_unique>(); } else if (in_mem_type == MemoryType::GPU_IMAGE) { - kernel_.reset(new opencl::image::ImageToBuffer); + kernel_ = make_unique>(); } else { - kernel_.reset(new opencl::buffer::BufferTransform); + kernel_ = make_unique>(); } } @@ -47,7 +48,7 @@ class OpenCLBufferTransformer { const OpenCLBufferType type, const MemoryType out_mem_type, const int wino_blk_size, - const DataFormat data_format, + bool has_data_format, Tensor *output) { Workspace *ws = context->workspace(); DataType dt = DataTypeToEnum::value; @@ -66,13 +67,14 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform CPU Buffer " << input->name() << " to GPU Buffer " << internal_tensor->name() << " with data type " << dt; - if (data_format == DataFormat::NHWC && input->shape().size() == 4) { + if (has_data_format && input->shape().size() == 4) { // 1. 
(NCHW -> NHWC) std::vector dst_dims = {0, 2, 3, 1}; std::vector output_shape = TransposeShape(input->shape(), dst_dims); internal_tensor->Resize(output_shape); + internal_tensor->set_data_format(DataFormat::NHWC); // TODO(liuqi): Only support float now const float *input_ptr = input->data(); Tensor::MappingGuard guard(internal_tensor); @@ -104,13 +106,13 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform GPU Buffer " << internal_tensor.name() << " to CPU Buffer " << output->name() << " with data type " << dt; - if (data_format == DataFormat::NHWC && - internal_tensor.shape().size() == 4) { + if (has_data_format && internal_tensor.shape().size() == 4) { // NHWC -> NCHW std::vector dst_dims = {0, 3, 1, 2}; std::vector output_shape = TransposeShape(internal_tensor.shape(), dst_dims); + output->set_data_format(DataFormat::NCHW); Tensor::MappingGuard guard(&internal_tensor); const float *internal_ptr = internal_tensor.data(); output->Resize(output_shape); diff --git a/mace/ops/opencl/channel_shuffle.h b/mace/ops/opencl/channel_shuffle.h index 86634d75bc0bb0e13254ec0a9c82714f7b746fda..df4a4b0f8e7fd92a0e4663643aeb1ef66de04e79 100644 --- a/mace/ops/opencl/channel_shuffle.h +++ b/mace/ops/opencl/channel_shuffle.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_CHANNEL_SHUFFLE_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/concat.h b/mace/ops/opencl/concat.h index d657ffbe9bdd2f69301b9f519491433531822f8d..abeec7c62e25299ac4de95e0b0dadc61bdb35900 100644 --- a/mace/ops/opencl/concat.h +++ b/mace/ops/opencl/concat.h @@ -18,7 +18,7 @@ #include #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/crop.h b/mace/ops/opencl/crop.h index 88aceea6f93845ece318b8b26b45e25eaf24dbfc..b12c5ee00fed43c0f954921bac11a26fa21e0f7e 100644 --- a/mace/ops/opencl/crop.h +++ b/mace/ops/opencl/crop.h @@ -18,7 +18,7 @@ #include #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/depth_to_space.h b/mace/ops/opencl/depth_to_space.h index 17c03d453593ccb7ca1c1ae58890f80b01a5c706..9d2d4fcba65fe01545c2588cdb9d667f53408af7 100644 --- a/mace/ops/opencl/depth_to_space.h +++ b/mace/ops/opencl/depth_to_space.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_DEPTH_TO_SPACE_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { class OpContext; diff --git a/mace/ops/opencl/eltwise.h b/mace/ops/opencl/eltwise.h index dec2b150d79a372a05895482a7db1819766b20e5..52156f06e908a394ae910abbeefb6da23e6cb236 100644 --- a/mace/ops/opencl/eltwise.h +++ b/mace/ops/opencl/eltwise.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_ELTWISE_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/fully_connected.h b/mace/ops/opencl/fully_connected.h index 8e421ad2f20510a76dcc0c5c841745d1832ac688..416aed6c8692ceaf45da1d1eb36f82b3753c8729 100644 --- a/mace/ops/opencl/fully_connected.h +++ b/mace/ops/opencl/fully_connected.h @@ -18,7 +18,7 @@ #include "mace/ops/activation.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/helper.cc b/mace/ops/opencl/helper.cc index 8f3cd289bae5da40365cdefc9397c58eb0e7b1d1..46d4fd5b288d8463bfb44a5a879d9a93a5aebc70 100644 --- a/mace/ops/opencl/helper.cc +++ 
b/mace/ops/opencl/helper.cc @@ -19,7 +19,7 @@ #include #include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/helper.h b/mace/ops/opencl/helper.h index 33ea688b51ab9cbc958af1e489959681061c3239..a4a49b4e15a021f1fa55fbd39c514777f03005bd 100644 --- a/mace/ops/opencl/helper.h +++ b/mace/ops/opencl/helper.h @@ -21,12 +21,13 @@ #include #include "mace/core/future.h" -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_util.h" #include "mace/core/types.h" -#include "mace/utils/utils.h" +#include "mace/utils/memory.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -41,8 +42,8 @@ namespace ops { #define MACE_OUT_OF_RANGE_INIT(kernel) \ if (runtime->IsOutOfRangeCheckEnabled()) { \ - oorc_flag = std::move(std::unique_ptr( \ - new Buffer((context)->device()->allocator()))); \ + oorc_flag = make_unique( \ + (context)->device()->allocator()); \ MACE_RETURN_IF_ERROR((oorc_flag)->Allocate(sizeof(int)));\ oorc_flag->Map(nullptr); \ *(oorc_flag->mutable_data()) = 0; \ diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc index db63300eb7607dead1cc9661533e0e7d463e5e4b..125a973ae7de4409b31fa2a716c35409d5955d0e 100644 --- a/mace/ops/opencl/image/conv_2d_3x3.cc +++ b/mace/ops/opencl/image/conv_2d_3x3.cc @@ -16,7 +16,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/common/activation_type.h" #include "mace/ops/opencl/helper.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc index 08568a5d9e39d671a2e3d84de8fc1fa22c588f95..7f0250cbc4ebc73cfa52c6041c9da8c95b7e3892 100644 --- a/mace/ops/opencl/image/conv_2d_general.cc +++ b/mace/ops/opencl/image/conv_2d_general.cc @@ -16,7 +16,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/opencl/helper.h" #include "mace/ops/common/activation_type.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h index 3ffb4fba69a8a79f46d188fbe9ddd9a2540759f1..e390a6ca69a2712dc1959c75ece199255011a173 100644 --- a/mace/ops/opencl/image/crop.h +++ b/mace/ops/opencl/image/crop.h @@ -34,16 +34,14 @@ template class CropKernel : public OpenCLCropKernel { public: explicit CropKernel( - const int axis, const std::vector &offset) - : axis_(axis), offset_(offset) {} + : offset_(offset) {} MaceStatus Compute( OpContext *context, const std::vector &input_list, Tensor *output) override; private: - const int axis_; std::vector offset_; cl::Kernel kernel_; uint32_t kwg_size_; @@ -68,57 +66,14 @@ MaceStatus CropKernel::Compute( std::vector offsets(4, 0); std::vector output_shape(input0->shape()); - switch (axis_) { - case 0: - if (offset_.size() == 1) { - offsets[0] = offset_[0]; - offsets[1] = offset_[0]; - offsets[2] = offset_[0]; - offsets[3] = offset_[0]; - } else if (offset_.size() == 4) { - offsets[0] = offset_[0]; - offsets[1] = offset_[2]; - offsets[2] = offset_[3]; - offsets[3] = offset_[1]; - } - for (int i = 0; i < 4; ++i) { - output_shape[i] = input1->dim(i); - } - break; - case 1: - if (offset_.size() == 1) { - offsets[1] = offset_[0]; - offsets[2] = offset_[0]; - offsets[3] = offset_[0]; 
- } else if (offset_.size() == 3) { - offsets[1] = offset_[1]; - offsets[2] = offset_[2]; - offsets[3] = offset_[0]; - } - for (int i = 1; i < 4; ++i) { - output_shape[i] = input1->dim(i); - } - break; - case 2: - if (offset_.size() == 1) { - offsets[1] = offset_[0]; - offsets[2] = offset_[0]; - } else if (offset_.size() == 2) { - offsets[1] = offset_[0]; - offsets[2] = offset_[1]; - } - output_shape[1] = input1->dim(1); - output_shape[2] = input1->dim(2); - break; - case 3: - if (offset_.size() == 1) { - offsets[2] = offset_[0]; - } - output_shape[2] = input1->dim(2); - break; - default: - MACE_CHECK(axis_ >= 0 && axis_ < 4, "axis is out of boundary."); - break; + for (index_t i = 0; i < in0_dims; ++i) { + if (offset_[i] >= 0) { + output_shape[i] = input1->dim(i); + offsets[i] = offset_[i]; + MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i)) + << "the crop for dimension " << i << " is out of bound with size " + << input1->dim(i) << " and offset " << offsets[i]; + } } MACE_CHECK(offsets[3] % 4 == 0, "MACE opencl only supports cropping channel" diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc index 527d6cc87f0b8e5023100a9d403f363d66db5871..27a0bc30533f4538a537dc6c3084178ee1d5d3cd 100644 --- a/mace/ops/opencl/image/winograd_conv2d.cc +++ b/mace/ops/opencl/image/winograd_conv2d.cc @@ -17,7 +17,8 @@ #include "mace/ops/common/activation_type.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/opencl/helper.h" -#include "mace/utils/utils.h" +#include "mace/utils/memory.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -264,9 +265,9 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, OpenCLBufferType::IN_OUT_HEIGHT, &t_input_image_shape); ScratchImage transformed_input_image(scratch_manager); - std::unique_ptr transformed_input(new Tensor( + std::unique_ptr transformed_input = make_unique( transformed_input_image.Scratch(context->device()->allocator(), - t_input_image_shape, dt), dt)); + t_input_image_shape, dt), dt); MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape, t_input_image_shape)); MACE_RETURN_IF_ERROR(WinogradInputTransform( @@ -289,9 +290,9 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, &mm_output_image_shape); ScratchImage mm_output_image(scratch_manager); - std::unique_ptr mm_output(new Tensor( + std::unique_ptr mm_output = make_unique( mm_output_image.Scratch(context->device()->allocator(), - mm_output_image_shape, dt), dt)); + mm_output_image_shape, dt), dt); MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape, mm_output_image_shape)); diff --git a/mace/ops/opencl/lstm_cell.h b/mace/ops/opencl/lstm_cell.h index 07ea2e65551092d5c8dcfe561a2dcaecc8c9261a..4dee034c12fa858fea262e2977d5383b5863e9b3 100644 --- a/mace/ops/opencl/lstm_cell.h +++ b/mace/ops/opencl/lstm_cell.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_LSTM_CELL_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/matmul.h b/mace/ops/opencl/matmul.h index c51c83c146fdc7834de05522f8ab6939f4974673..05879f8ae2ed8623652316d13e0526e48a584b3b 100644 --- a/mace/ops/opencl/matmul.h +++ b/mace/ops/opencl/matmul.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_MATMUL_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/out_of_range_check_test.cc b/mace/ops/opencl/out_of_range_check_test.cc index 
61e19808d1dad91045876e75e9b525c042c78427..8909f35113c5a77d78cf614970d9d027019f111c 100644 --- a/mace/ops/opencl/out_of_range_check_test.cc +++ b/mace/ops/opencl/out_of_range_check_test.cc @@ -22,6 +22,7 @@ #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/ops/opencl/helper.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -130,7 +131,8 @@ TEST(OutOfRangeCheckTest, RandomTest) { index_t channels = 11; GPUContext gpu_context; - std::unique_ptr device(new GPUDevice(gpu_context.opencl_tuner())); + std::unique_ptr device = make_unique( + gpu_context.opencl_tuner()); Workspace ws; OpContext context(&ws, device.get()); diff --git a/mace/ops/opencl/pad.h b/mace/ops/opencl/pad.h index cfc7edb3a1351e1b9bf8d1a152e4d906c6f09d47..640137691964b9e57cca9e69ee0c73a8d85420f0 100644 --- a/mace/ops/opencl/pad.h +++ b/mace/ops/opencl/pad.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_PAD_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { class OpContext; diff --git a/mace/ops/opencl/reduce.h b/mace/ops/opencl/reduce.h index 4f6aa2187561a22ac0e6758b45738a73a6bf9fa7..f653f8b02805dfb387d38b617eed2256e70255d9 100644 --- a/mace/ops/opencl/reduce.h +++ b/mace/ops/opencl/reduce.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_REDUCE_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/resize_bicubic.h b/mace/ops/opencl/resize_bicubic.h index 4fde45453b6335e2ca1a1eab8c57f909b253e97b..b7fd71a0dbcddc85322fae1b2a8973a1b63af1b5 100644 --- a/mace/ops/opencl/resize_bicubic.h +++ b/mace/ops/opencl/resize_bicubic.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_RESIZE_BICUBIC_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" #include "mace/core/types.h" namespace mace { diff --git a/mace/ops/opencl/resize_bilinear.h b/mace/ops/opencl/resize_bilinear.h index 18dd312845b0bcba5ed91a1f9ad5aa0311e2279a..66035d8511136952eeefe92efce0a3fd614aad5e 100644 --- a/mace/ops/opencl/resize_bilinear.h +++ b/mace/ops/opencl/resize_bilinear.h @@ -17,7 +17,7 @@ #include "mace/core/types.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/resize_nearest_neighbor.h b/mace/ops/opencl/resize_nearest_neighbor.h index fda220aee9704228d435a304001a5f679f2d28e3..b0178827ac6190d413b179b4a98c367d1e5f9c37 100644 --- a/mace/ops/opencl/resize_nearest_neighbor.h +++ b/mace/ops/opencl/resize_nearest_neighbor.h @@ -17,7 +17,7 @@ #include "mace/core/types.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/softmax.h b/mace/ops/opencl/softmax.h index caca5dc693a6d2e8735edc91a0f5c9f0feab65c4..a4a439ec1501db36a417c2d533d14cf1bea103e5 100644 --- a/mace/ops/opencl/softmax.h +++ b/mace/ops/opencl/softmax.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_SOFTMAX_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/space_to_batch.h b/mace/ops/opencl/space_to_batch.h index 350bf120975c6e3747e8cb5c848d9eb88f646d71..9f73ff5acdfaaa7956219bff51c42c4beb9c40b4 100644 --- a/mace/ops/opencl/space_to_batch.h +++ b/mace/ops/opencl/space_to_batch.h @@ -19,7 +19,7 @@ #include "mace/core/types.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff 
--git a/mace/ops/opencl/space_to_depth.h b/mace/ops/opencl/space_to_depth.h index 69e40c82a0e3d7d06cd181d52bb71f7f1b7bd8e0..454cb686e7d1589c3c1329cc97d7a60b3c9ed663 100644 --- a/mace/ops/opencl/space_to_depth.h +++ b/mace/ops/opencl/space_to_depth.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_SPACE_TO_DEPTH_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/split.h b/mace/ops/opencl/split.h index 61f75d85c6f2c3032eacbeb908bb79edf61c26ea..8c7ac5636b77a1d07449d6f8ce09c77a6934b537 100644 --- a/mace/ops/opencl/split.h +++ b/mace/ops/opencl/split.h @@ -18,7 +18,7 @@ #include #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/sqrdiff_mean.h b/mace/ops/opencl/sqrdiff_mean.h index 822b992f0009726210126bcb19d06f480dcbf7ca..781a08c56568bf6f27230abd109d53ab24faa7e3 100644 --- a/mace/ops/opencl/sqrdiff_mean.h +++ b/mace/ops/opencl/sqrdiff_mean.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_SQRDIFF_MEAN_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { class OpContext; diff --git a/mace/ops/ops_registry.cc b/mace/ops/ops_registry.cc index 58f6572756ac1a02b53bdac6c7de1c1622679684..7fc3545883ee855a578c001ac3ff75ff574261b6 100644 --- a/mace/ops/ops_registry.cc +++ b/mace/ops/ops_registry.cc @@ -29,6 +29,7 @@ extern void RegisterChannelShuffle(OpRegistryBase *op_registry); extern void RegisterConcat(OpRegistryBase *op_registry); extern void RegisterConv2D(OpRegistryBase *op_registry); extern void RegisterCrop(OpRegistryBase *op_registry); +extern void RegisterCumsum(OpRegistryBase *op_registry); extern void RegisterDeconv2D(OpRegistryBase *op_registry); extern void RegisterDepthToSpace(OpRegistryBase *op_registry); extern void RegisterDepthwiseConv2d(OpRegistryBase *op_registry); @@ -44,6 +45,7 @@ extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry); extern void RegisterMatMul(OpRegistryBase *op_registry); extern void RegisterOneHot(OpRegistryBase *op_registry); extern void RegisterPad(OpRegistryBase *op_registry); +extern void RegisterPNorm(OpRegistryBase *op_registry); extern void RegisterPooling(OpRegistryBase *op_registry); extern void RegisterReduce(OpRegistryBase *op_registry); extern void RegisterPriorBox(OpRegistryBase *op_registry); @@ -54,14 +56,19 @@ extern void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry); extern void RegisterReverse(OpRegistryBase *op_registry); extern void RegisterScalarMath(OpRegistryBase *op_registry); extern void RegisterShape(OpRegistryBase *op_registry); +extern void RegisterSlice(OpRegistryBase *op_registry); extern void RegisterSoftmax(OpRegistryBase *op_registry); extern void RegisterSpaceToBatchND(OpRegistryBase *op_registry); extern void RegisterSpaceToDepth(OpRegistryBase *op_registry); +extern void RegisterSplice(OpRegistryBase *op_registry); extern void RegisterSplit(OpRegistryBase *op_registry); extern void RegisterSqrDiffMean(OpRegistryBase *op_registry); extern void RegisterSqueeze(OpRegistryBase *op_registry); extern void RegisterStack(OpRegistryBase *op_registry); extern void RegisterStridedSlice(OpRegistryBase *op_registry); +extern void RegisterSumGroup(OpRegistryBase *op_registry); +extern void RegisterTargetRMSNorm(OpRegistryBase *op_registry); +extern void RegisterTimeOffset(OpRegistryBase *op_registry); extern void RegisterTranspose(OpRegistryBase *op_registry); extern void 
RegisterUnstack(OpRegistryBase *op_registry); @@ -90,6 +97,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterConcat(this); ops::RegisterConv2D(this); ops::RegisterCrop(this); + ops::RegisterCumsum(this); ops::RegisterDeconv2D(this); ops::RegisterDepthToSpace(this); ops::RegisterDepthwiseConv2d(this); @@ -105,6 +113,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterMatMul(this); ops::RegisterOneHot(this); ops::RegisterPad(this); + ops::RegisterPNorm(this); ops::RegisterPooling(this); ops::RegisterReduce(this); ops::RegisterPriorBox(this); @@ -115,14 +124,19 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterReverse(this); ops::RegisterScalarMath(this); ops::RegisterShape(this); + ops::RegisterSlice(this); ops::RegisterSoftmax(this); ops::RegisterSpaceToBatchND(this); ops::RegisterSpaceToDepth(this); + ops::RegisterSplice(this); ops::RegisterSplit(this); ops::RegisterStack(this); ops::RegisterStridedSlice(this); ops::RegisterSqrDiffMean(this); ops::RegisterSqueeze(this); + ops::RegisterSumGroup(this); + ops::RegisterTargetRMSNorm(this); + ops::RegisterTimeOffset(this); ops::RegisterTranspose(this); ops::RegisterUnstack(this); diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index ce9c1bbde07ddd8857f33718f06eb47d1fb34fa9..25de146a59db15f456a0941c14222fc30a5a54e7 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -14,6 +14,7 @@ #include "mace/ops/ops_test_util.h" #include "mace/core/memory_optimizer.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -120,17 +121,16 @@ OpTestContext *OpTestContext::Get(int num_threads, OpTestContext::OpTestContext(int num_threads, CPUAffinityPolicy cpu_affinity_policy, bool use_gemmlowp) - : gpu_context_(new GPUContext(GetStoragePathFromEnv())), + : gpu_context_(std::make_shared(GetStoragePathFromEnv())), opencl_mem_types_({MemoryType::GPU_IMAGE}) { - device_map_[DeviceType::CPU] = std::unique_ptr( - new CPUDevice(num_threads, - cpu_affinity_policy, - use_gemmlowp)); - - device_map_[DeviceType::GPU] = std::unique_ptr( - new GPUDevice(gpu_context_->opencl_tuner(), - gpu_context_->opencl_cache_storage(), - GPUPriorityHint::PRIORITY_NORMAL)); + device_map_[DeviceType::CPU] = make_unique( + num_threads, cpu_affinity_policy, use_gemmlowp); + + device_map_[DeviceType::GPU] = make_unique( + gpu_context_->opencl_tuner(), + gpu_context_->opencl_cache_storage(), + GPUPriorityHint::PRIORITY_NORMAL, + GPUPerfHint::PERF_HIGH); } std::shared_ptr OpTestContext::gpu_context() const { @@ -167,9 +167,20 @@ bool OpsTestNet::Setup(mace::DeviceType device) { !ws_.GetTensor(input)->is_weight()) { auto input_info = net_def.add_input_info(); input_info->set_name(input); - auto data_format = ProtoArgHelper::GetOptionalArg( - op_def, "data_format", DataFormat::DF_NONE); - input_info->set_data_format(data_format); + auto has_data_format = ProtoArgHelper::GetOptionalArg( + op_def, "has_data_format", 1); + auto is_quantized_op = ProtoArgHelper::GetOptionalArg( + op_def, "T", static_cast(DT_FLOAT)) + == static_cast(DT_UINT8); + if (has_data_format) { + if (is_quantized_op || device == DeviceType::GPU) { + input_info->set_data_format(NHWC); + } else { + input_info->set_data_format(NCHW); + } + } else { + input_info->set_data_format(DataFormat::DF_NONE); + } auto &shape = ws_.GetTensor(input)->shape(); for (auto d : shape) { input_info->add_dims(static_cast(d)); @@ -177,24 +188,26 @@ bool OpsTestNet::Setup(mace::DeviceType device) { } } } - auto op_def = op_defs_.back(); - for (int i = 0; i < 
op_def.output_size(); ++i) { - ws_.RemoveTensor(op_def.output(i)); - auto output_info = net_def.add_output_info(); - output_info->set_name(op_def.output(i)); - if (op_def.output_type_size() == op_def.output_size()) { - output_info->set_data_type(op_def.output_type(i)); - } else { - output_info->set_data_type(DataType::DT_FLOAT); + if (!op_defs_.empty()) { + auto op_def = op_defs_.back(); + for (int i = 0; i < op_def.output_size(); ++i) { + ws_.RemoveTensor(op_def.output(i)); + auto output_info = net_def.add_output_info(); + output_info->set_name(op_def.output(i)); + if (op_def.output_type_size() == op_def.output_size()) { + output_info->set_data_type(op_def.output_type(i)); + } else { + output_info->set_data_type(DataType::DT_FLOAT); + } } } MemoryOptimizer mem_optimizer; - net_ = std::unique_ptr(new SerialNet( + net_ = make_unique( op_registry_.get(), &net_def, &ws_, OpTestContext::Get()->GetDevice(device), - &mem_optimizer)); + &mem_optimizer); MaceStatus status = (ws_.PreallocateOutputTensor( net_def, &mem_optimizer, @@ -236,12 +249,12 @@ MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def, const mace::DeviceType device) { device_type_ = device; MemoryOptimizer mem_optimizer; - net_ = std::unique_ptr(new SerialNet( + net_ = make_unique( op_registry_.get(), &net_def, &ws_, OpTestContext::Get()->GetDevice(device), - &mem_optimizer)); + &mem_optimizer); MACE_RETURN_IF_ERROR(ws_.PreallocateOutputTensor( net_def, &mem_optimizer, diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 07cbad06bdb57381ca3befada4baf1e1f11b5bed..8226079711535766f30e06626b80110c4883b82a 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -34,7 +34,8 @@ #include "mace/core/workspace.h" #include "mace/ops/ops_registry.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/memory.h" +#include "mace/utils/math.h" #include "mace/utils/quantize.h" #include "mace/ops/testing/test_utils.h" @@ -97,7 +98,7 @@ class OpTestContext { class OpsTestNet { public: OpsTestNet() : - op_registry_(new OpRegistry()) {} + op_registry_(make_unique()) {} template void AddInputFromArray(const std::string &name, @@ -258,9 +259,9 @@ class OpsTestNet { template void TransformFilterDataFormat(const std::string &src_name, - const FilterDataFormat src_format, + const DataFormat src_format, const std::string &dst_name, - const FilterDataFormat dst_format) { + const DataFormat dst_format) { Tensor *input = ws_.GetTensor(src_name); Tensor *output = ws_.CreateTensor( dst_name, @@ -355,9 +356,9 @@ class OpsTestNet { std::unique_ptr CreateTensor( const std::vector &shape = {}, const std::vector &data = {}) { - std::unique_ptr res( - new Tensor(OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v())); + std::unique_ptr res = make_unique( + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); if (!data.empty()) { res->Resize(shape); T *input_data = res->mutable_data(); diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index 0dfdf673b21f49ce231030251ed78004971e0b3f..aaa6b230f4b5237dc88d16e369dcf289a8fe9df6 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -20,6 +20,8 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/pad.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -39,9 +41,9 @@ class PadOp : public Operation { constant_value_(Operation::GetOptionalArg( "constant_value", 0.0)) { MACE_CHECK(paddings_.size() == 8); - auto df = 
static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df) { paddings_ = TransposeShape(paddings_, {0, 1, 6, 7, 2, 3, 4, 5}); } } @@ -54,11 +56,9 @@ class PadOp : public Operation { this->paddings_.size() == static_cast(input->dim_size()) * 2); auto input_shape = input->shape(); for (size_t i = 0; i < paddings_.size(); ++i) { - if (type_ == PadType::REFLECT) { - MACE_CHECK(paddings_[i] < input_shape[i / 2]); - - } else if (type_ == PadType::SYMMETRIC) { - MACE_CHECK(paddings_[i] <= input_shape[i / 2]); + if (type_ == PadType::REFLECT || type_ == PadType::SYMMETRIC) { + MACE_CHECK(paddings_[i] < input_shape[i / 2], paddings_[i], + " vs ", input_shape[i / 2]); } MACE_CHECK(paddings_[i] >= 0); } @@ -182,8 +182,8 @@ class PadOp : public Operation { float constant_value = Operation::GetOptionalArg( "constant_value", 0.0); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::PadKernel( - type, paddings, constant_value)); + kernel_ = make_unique>( + type, paddings, constant_value); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index 0466aa6be486d5f5917f4397006e5cdc4619179e..b449e02f9166c21620daf289baac89b34c25b37f 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -29,7 +29,11 @@ void Pad(int iters, int batch, int height, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); + if (D == DeviceType::CPU) { + net.AddRandomInput("Input", {batch, channels, height, width}); + } else { + net.AddRandomInput("Input", {batch, height, width, channels}); + } const std::vector paddings = {0, 0, pad, pad, pad, pad, 0, 0}; OpDefBuilder("Pad", "PadTest") @@ -37,6 +41,7 @@ void Pad(int iters, int batch, int height, .Output("Output") .AddIntsArg("paddings", paddings) .AddIntArg("pad_type", pad_type) + .AddIntArg("has_data_format", 1) .AddFloatArg("constant_value", 1.0) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index 63bb449f25057ae8335dc95a6d52042dec2186c6..e68e8eb8d06b864b9c9173ada5fbb2312ec0566c 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -39,7 +39,7 @@ void SimpleConstant() { .Output("Output") .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -52,7 +52,7 @@ void SimpleConstant() { .Output("TOutput") .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -101,7 +101,7 @@ void Result(const std::vector &input_shape, .Output(t_output) .AddIntsArg("paddings", paddings) .AddIntArg("pad_type", static_cast(pad_type)) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -179,7 +179,7 @@ TEST_F(PadTest, ComplexCPU) { .Output("TOutput") .AddIntsArg("paddings", {0, 0, 1, 1, 1, 1, 1, 1}) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -217,7 +217,7 @@ void Complex(const std::vector &input_shape, .AddIntsArg("paddings", 
paddings)
          .AddIntArg("pad_type", pad_type)
          .AddFloatArg("constant_value", 1.0)
-         .AddIntArg("data_format", DataFormat::NHWC)
+         .AddIntArg("has_data_format", 1)
          .Finalize(net.NewOperatorDef());

     // Run
@@ -234,7 +234,7 @@ void Complex(const std::vector<index_t> &input_shape,
          .AddIntsArg("paddings", paddings)
          .AddIntArg("pad_type", pad_type)
          .AddFloatArg("constant_value", 1.0)
-         .AddIntArg("data_format", DataFormat::NHWC)
+         .AddIntArg("has_data_format", 1)
          .Finalize(net.NewOperatorDef());

     // Run
diff --git a/mace/ops/pnorm.cc b/mace/ops/pnorm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8742a3b4492cb36aab4deece867a2021c4afd106
--- /dev/null
+++ b/mace/ops/pnorm.cc
@@ -0,0 +1,133 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This Op is for PNormComponent in Kaldi.
+// The last dim of the input must be divisible by output_dim:
+// each row is split into output_dim groups of
+// group = input_dim / output_dim consecutive elements.
+// For each row:
+// p is 0: output[i] = sum(abs(input[i * group + j]) > 0)
+// p is 1: output[i] = sum(abs(input[i * group + j]))
+// p is 2: output[i] = sqrt(sum(input[i * group + j] * input[i * group + j])),
+// for j = (0 : group - 1)
+// p's default value is 2.
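+//
+// A worked example (a sketch derived from the SimpleTest case in
+// pnorm_test.cc below): with p = 2, output_dim = 5 and the input row
+// {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, each group holds 2 elements, so
+// output = {sqrt(1 + 4), sqrt(9 + 16), sqrt(25 + 36), sqrt(49 + 64),
+// sqrt(81 + 100)} ~= {2.236, 5, 7.810, 10.630, 13.454}.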
+
+#include <cmath>
+#include <functional>
+
+#include "mace/core/operator.h"
+
+namespace mace {
+namespace ops {
+
+template <DeviceType D, typename T>
+class PNormOp;
+
+template <typename T>
+class PNormOp<DeviceType::CPU, T> : public Operation {
+ public:
+  explicit PNormOp(OpConstructContext *context)
+      : Operation(context),
+        p_(Operation::GetOptionalArg<int>("p", 2)),
+        output_dim_(Operation::GetOptionalArg<int>("output_dim", 0)) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    const Tensor *input = this->Input(0);
+    Tensor *output = this->Output(0);
+
+    const std::vector<index_t> &input_shape = input->shape();
+    const index_t dim_size = input_shape.size();
+    MACE_CHECK(dim_size >= 1, "PNorm only supports input dim size >= 1");
+    std::vector<index_t> output_shape(input_shape);
+    const index_t input_dim = input_shape[dim_size - 1];
+    MACE_CHECK(output_dim_ > 0,
+               "Output dim should be greater than zero.");
+    MACE_CHECK(input_dim % output_dim_ == 0 && output_dim_ < input_dim,
+               "PNorm's input dim should be a multiple of output dim.");
+    const index_t group_size = input_dim / output_dim_;
+    output_shape[dim_size - 1] = output_dim_;
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+    Tensor::MappingGuard guard_input(input);
+    Tensor::MappingGuard guard_output(output);
+
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+    const index_t bh =
+        std::accumulate(input->shape().begin(), input->shape().end() - 1, 1,
+                        std::multiplies<index_t>());
+    if (p_ == 0) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (index_t i = 0; i < bh; ++i) {
+        for (index_t j = 0; j < output_dim_; ++j) {
+          const T *in_base = input_data + i * input_dim + j * group_size;
+          T *out_base = output_data + i * output_dim_;
+          T temp_result = 0;
+          for (index_t g = 0; g < group_size; ++g) {
+            T value =
+                (std::fabs(in_base[g])
+                    > std::numeric_limits<T>::epsilon()) ? 1.0f : 0.0f;
+            temp_result += value;
+          }
+          out_base[j] = temp_result;
+        }
+      }
+    } else if (p_ == 1) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (index_t i = 0; i < bh; ++i) {
+        for (index_t j = 0; j < output_dim_; ++j) {
+          const T *in_base = input_data + i * input_dim + j * group_size;
+          T *out_base = output_data + i * output_dim_;
+          T temp_result = 0;
+          for (index_t g = 0; g < group_size; ++g) {
+            temp_result += std::abs(in_base[g]);
+          }
+          out_base[j] = temp_result;
+        }
+      }
+    } else if (p_ == 2) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (index_t i = 0; i < bh; ++i) {
+        for (index_t j = 0; j < output_dim_; ++j) {
+          const T *in_base = input_data + i * input_dim + j * group_size;
+          T *out_base = output_data + i * output_dim_;
+          T temp_result = 0;
+          for (index_t g = 0; g < group_size; ++g) {
+            temp_result += in_base[g] * in_base[g];
+          }
+          out_base[j] = std::sqrt(temp_result);
+        }
+      }
+    } else {
+      LOG(FATAL) << "PNorm's p should be 0, 1 or 2, here p is: " << p_;
+    }
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  int p_;
+  int output_dim_;
+};
+
+void RegisterPNorm(OpRegistryBase *op_registry) {
+  MACE_REGISTER_OP(op_registry, "PNorm", PNormOp,
+                   DeviceType::CPU, float);
+}
+
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/pnorm_benchmark.cc b/mace/ops/pnorm_benchmark.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e3af765cd22f3589abd602dc6e28cd96acc2ee0f
--- /dev/null
+++ b/mace/ops/pnorm_benchmark.cc
@@ -0,0 +1,77 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+namespace {
+template <DeviceType D, typename T>
+void PNormBenchmark(int iters, int n, int h, int w, int p, int ow) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+  // Add input data
+  net.AddRandomInput<D, T>("Input", {n, h, w});
+
+  OpDefBuilder("PNorm", "PNormBM")
+      .Input("Input")
+      .AddIntArg("p", p)
+      .AddIntArg("output_dim", ow)
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Warm-up
+  for (int i = 0; i < 5; ++i) {
+    net.RunOp(D);
+  }
+  net.Sync();
+
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.RunOp(D);
+    net.Sync();
+  }
+}
+}  // namespace
+
+#define MACE_BM_PNORM_MACRO(N, H, W, P, OW, TYPE, DEVICE)              \
+  static void                                                          \
+      MACE_BM_PNORM_##N##_##H##_##W##_##P##_##OW##_##TYPE##_##DEVICE(  \
+          int iters) {                                                 \
+    const int64_t tot = static_cast<int64_t>(iters) * N * H * W;       \
+    mace::testing::BytesProcessed(tot * (sizeof(TYPE)));               \
+    PNormBenchmark<DEVICE, TYPE>(iters, N, H, W, P, OW);               \
+  }                                                                    \
+  MACE_BENCHMARK(                                                      \
+      MACE_BM_PNORM_##N##_##H##_##W##_##P##_##OW##_##TYPE##_##DEVICE)
+
+#define MACE_BM_PNORM(N, H, W, P, OW) \
+  MACE_BM_PNORM_MACRO(N, H, W, P, OW, float, CPU);
+
+MACE_BM_PNORM(1, 10, 256, 0, 128);
+MACE_BM_PNORM(1, 20, 128, 1, 64);
+MACE_BM_PNORM(1, 10, 128, 2, 64);
+MACE_BM_PNORM(1, 16, 256, 0, 128);
+MACE_BM_PNORM(1, 32, 128, 1, 64);
+MACE_BM_PNORM(1, 10, 512, 2, 256);
+
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/pnorm_test.cc b/mace/ops/pnorm_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3510868250cadd655ffe345855973e39d2a0e534
--- /dev/null
+++ b/mace/ops/pnorm_test.cc
@@ -0,0 +1,70 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+class PNormOpTest : public OpsTestBase {};
+
+namespace {
+template <typename T>
+void TestPNorm(const std::vector<index_t> &input_shape,
+               const std::vector<T> &input,
+               const int p,
+               const int output_dim,
+               const std::vector<index_t> &output_shape,
+               const std::vector<T> &output) {
+  OpsTestNet net;
+  net.AddInputFromArray<DeviceType::CPU, T>(MakeString("Input"),
+                                            input_shape,
+                                            input);
+
+  OpDefBuilder("PNorm", "PNormTest")
+      .Input("Input")
+      .AddIntArg("p", p)
+      .AddIntArg("output_dim", output_dim)
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  net.RunOp();
+
+  net.AddInputFromArray<DeviceType::CPU, T>("ExpectedOutput", output_shape,
+                                            output);
+  ExpectTensorNear<T>(*net.GetOutput("ExpectedOutput"),
+                      *net.GetOutput("Output"));
+}
+}  // namespace
+
+TEST_F(PNormOpTest, SimpleTest) {
+  TestPNorm<float>(
+      {1, 5, 10},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+       3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+       5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+       7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
+      2, 5,
+      {1, 5, 5},
+      {2.236067977, 5, 7.810249676, 10.630145813, 13.453624047,
+       5, 7.810249676, 10.630145813, 13.453624047, 16.278820596,
+       7.810249676, 10.630145813, 13.453624047, 16.278820596, 19.104973175,
+       10.630145813, 13.453624047, 16.278820596, 19.104973175, 21.931712199,
+       13.453624047, 16.278820596, 19.104973175, 21.931712199, 24.758836806});
+}
+
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc
index 8fd87cdfa38771a56636fd7bd54894ea1cbe042e..969f2774e3bb5a5fcf35e37e5f613f2f87b9f19b 100644
--- a/mace/ops/pooling.cc
+++ b/mace/ops/pooling.cc
@@ -32,6 +32,7 @@
 #include "mace/ops/opencl/image/pooling.h"
 #include "mace/ops/opencl/buffer/pooling.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"

 namespace mace {
 namespace ops {
@@ -433,10 +434,10 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
   explicit PoolingOp(OpConstructContext *context)
       : PoolingOpBase(context) {
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::PoolingKernel<T>);
+      kernel_ = make_unique<opencl::image::PoolingKernel<T>>();
     } else {
       context->set_output_mem_type(MemoryType::GPU_BUFFER);
-      kernel_.reset(new opencl::buffer::PoolingKernel<T>);
+      kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>();
     }
   }
   MaceStatus Run(OpContext *context) override {
diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc
index f4a147cc7b8191f5323cf38acd532830a44948c9..068212f204d85a3129d1f7ad9e9cbe0cfca06491 100644
--- a/mace/ops/reduce.cc
+++ b/mace/ops/reduce.cc
@@ -25,6 +25,7 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/reduce.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"

 namespace mace {
 namespace ops {
@@ -84,7 +85,7 @@ class ReduceOp : public ReduceOpBase {
  private:
  void Simplify(const Tensor *input) {
    std::vector<bool> bitmap(static_cast<size_t>(input->dim_size()), false);
-    if (axis_.size() == 0) {
+    if (axis_.empty()) {
      for (int i = 0; i < input->dim_size(); ++i) {
        bitmap[i] = true;
      }
@@ -93,9 +94,9 @@
      int index = axis_[i] >= 0 ?
axis_[i] : axis_[i] + input->dim_size(); - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && DataTypeToEnum::value != DT_UINT8 + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && DataTypeToEnum::value != DT_UINT8 && input->dim_size() == 4) { if (index == 1 || index == 2) index = index + 1; else if (index == 3) index = 1; @@ -847,9 +848,9 @@ class ReduceOp : public ReduceOpBase { explicit ReduceOp(OpConstructContext *context) : ReduceOpBase(context) { if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ReduceKernel(reduce_type_, - axis_, - keep_dims_)); + kernel_ = make_unique>(reduce_type_, + axis_, + keep_dims_); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/reduce_benchmark.cc b/mace/ops/reduce_benchmark.cc index d97131672c2fba7d988b0e5118a410b54acc571a..1d5fbe33ccb10dc7ffbef9b00353ed93889691fd 100644 --- a/mace/ops/reduce_benchmark.cc +++ b/mace/ops/reduce_benchmark.cc @@ -38,6 +38,7 @@ void Reduce(int iters, int batch, int channels, .Input("Input") .AddIntsArg("axis", axis) .Output("OutputImage") + .AddIntArg("has_data_format", 1) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/reduce_test.cc b/mace/ops/reduce_test.cc index 78a9f9345a8ca4da9eae0a0beedcb8dd1fbed49c..fc284084b25dfe7aac2c6fb936953dbe98e75212 100644 --- a/mace/ops/reduce_test.cc +++ b/mace/ops/reduce_test.cc @@ -44,7 +44,7 @@ void Simple(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run @@ -56,7 +56,7 @@ void Simple(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); // Run @@ -84,7 +84,7 @@ void Simple3D(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 
1 : 0) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); // Run @@ -588,7 +588,7 @@ void RandomTest(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run @@ -600,7 +600,7 @@ void RandomTest(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OPENCLOutput") .Finalize(net.NewOperatorDef()); // Run @@ -662,7 +662,7 @@ void TestQuant(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .AddIntArg("T", DT_FLOAT) .Finalize(net.NewOperatorDef()); @@ -687,7 +687,7 @@ void TestQuant(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .AddIntArg("T", DT_UINT8) .Finalize(net.NewOperatorDef()); net.RunOp(); diff --git a/mace/ops/ref/conv_2d.cc b/mace/ops/ref/conv_2d.cc index 4707d9229bd9ce5cac322bcc8b0294521e061062..e5b7952a334b8fb5bcc4d13d8264fc6f76d8c41d 100644 --- a/mace/ops/ref/conv_2d.cc +++ b/mace/ops/ref/conv_2d.cc @@ -16,7 +16,6 @@ #include "mace/ops/ref/conv_2d.h" #include -#include "mace/ops/common/conv_pool_2d_util.h" namespace mace { namespace ops { @@ -30,31 +29,36 @@ MaceStatus Conv2d::Compute(const OpContext *context, const std::vector in_shape = input->shape(); const std::vector filter_shape = filter->shape(); - const std::vector out_shape = output->shape(); - const std::vector stride_hw{stride_h_, stride_w_}; - const std::vector dilation_hw{dilation_h_, dilation_w_}; - const std::vector paddings{pad_h_, pad_w_}; - const index_t pad_top = pad_h_ >> 1; - const index_t pad_left = pad_w_ >> 1; - - std::vector output_shape(4); - - CalcOutputSize(in_shape.data(), - NCHW, - filter_shape.data(), - OIHW, - paddings.data(), - dilation_hw.data(), - stride_hw.data(), - RoundType::FLOOR, - output_shape.data()); - output->Resize(output_shape); - + std::vector out_shape(4); + + std::vector paddings(2); + if (paddings_.empty()) { + CalcNCHWPaddingAndOutputSize(input->shape().data(), + filter->shape().data(), + dilations_.data(), + strides_.data(), + padding_type_, + out_shape.data(), + paddings.data()); + } else { + paddings = paddings_; + CalcNCHWOutputSize(input->shape().data(), + filter->shape().data(), + paddings_.data(), + dilations_.data(), + strides_.data(), + RoundType::FLOOR, + out_shape.data()); + } + const index_t pad_top = paddings[0] >> 1; + const index_t pad_left = paddings[1] >> 1; + output->Resize(out_shape); const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = filter_shape[1] * in_image_size; const index_t out_batch_size = filter_shape[0] * out_image_size; const index_t filter_size = filter_shape[2] * filter_shape[3]; + Tensor::MappingGuard input_guard(input); Tensor::MappingGuard filter_guard(filter); Tensor::MappingGuard output_guard(output); @@ -86,8 +90,10 @@ MaceStatus Conv2d::Compute(const OpContext *context, for 
(index_t kh = 0; kh < filter_shape[2]; ++kh) { for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - const index_t ih = -pad_top + h * stride_h_ + kh * dilation_h_; - const index_t iw = -pad_left + w * stride_w_ + kw * dilation_w_; + const index_t + ih = -pad_top + h * strides_[0] + kh * dilations_[0]; + const index_t + iw = -pad_left + w * strides_[1] + kw * dilations_[1]; if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw]; } diff --git a/mace/ops/ref/conv_2d.h b/mace/ops/ref/conv_2d.h index e99af5cf0093dd7ab419d7b321ed36bf941bfeb3..10baac8cb86abcdd1f88993ae12fb752f589fcb7 100644 --- a/mace/ops/ref/conv_2d.h +++ b/mace/ops/ref/conv_2d.h @@ -16,9 +16,12 @@ #ifndef MACE_OPS_REF_CONV_2D_H_ #define MACE_OPS_REF_CONV_2D_H_ +#include + #include "mace/public/mace.h" #include "mace/core/tensor.h" #include "mace/core/op_context.h" +#include "mace/ops/common/conv_pool_2d_util.h" namespace mace { namespace ops { @@ -27,30 +30,39 @@ namespace ref { template class Conv2d { public: - Conv2d(int stride_h, int stride_w, int dilation_h, int dilation_w); + Conv2d(const std::vector strides, + const std::vector dilations, + const std::vector paddings, + const Padding padding_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type) {} ~Conv2d() {} MaceStatus Compute( const OpContext *context, const Tensor *input, const Tensor *filter, Tensor *output); + + private: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; }; template<> class Conv2d { public: - Conv2d(int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w) - : pad_h_(pad_h), - pad_w_(pad_w), - stride_h_(stride_h), - stride_w_(stride_w), - dilation_h_(dilation_h), - dilation_w_(dilation_w) {} + Conv2d(const std::vector strides, + const std::vector dilations, + const std::vector paddings, + const Padding padding_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type) {} ~Conv2d() {} // Always row-major after transpose MaceStatus Compute( @@ -60,12 +72,10 @@ class Conv2d { Tensor *output); private: - int pad_h_; - int pad_w_; - int stride_h_; - int stride_w_; - int dilation_h_; - int dilation_w_; + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; }; } // namespace ref diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc index f082cf31a9dbf35aad4ce2ca65c5f4cb6d5679e7..98ea215e7678b32170bf98d415b0c88ec23a60e6 100644 --- a/mace/ops/reshape.cc +++ b/mace/ops/reshape.cc @@ -15,6 +15,7 @@ #include #include "mace/core/operator.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -23,16 +24,12 @@ template class ReshapeOp : public Operation { public: explicit ReshapeOp(OpConstructContext *context) - : Operation(context) {} + : Operation(context), + has_df_(Operation::GetOptionalArg("has_data_format", 0)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); const Tensor *input = this->Input(INPUT); - const std::vector &input_shape = input->shape(); - int axis = Operation::GetOptionalArg("reshape_axis", 0); - int num_axes = Operation::GetOptionalArg("num_axes", -1); - MACE_CHECK(axis == 0 && num_axes == -1, - "Only support axis = 0 and num_axes = -1"); const Tensor *shape = this->Input(SHAPE); const index_t num_dims = shape->dim_size() == 0 ? 
0 : shape->dim(0); Tensor::MappingGuard shape_guard(shape); @@ -40,20 +37,16 @@ class ReshapeOp : public Operation { int unknown_idx = -1; index_t product = 1; - std::vector out_shape; + std::vector out_shape(num_dims); index_t n = 0; for (int i = 0; i < num_dims; ++i) { if (shape_data[i] == -1) { MACE_CHECK(unknown_idx == -1, "Only one input size may be -1"); unknown_idx = i; - out_shape.push_back(1); - } else if (shape_data[i] == 0) { - MACE_CHECK(shape_data[i] == 0, "Shape should be 0"); - out_shape.push_back(input_shape[i]); - product *= input_shape[i]; + out_shape[i] = 1; } else { - MACE_CHECK(shape_data[i] > 0, "Shape must be non-negative: ", + MACE_CHECK(shape_data[i] >= 0, "Shape must be non-negative: ", shape_data[i]); if (shape_data[i] == 0) { MACE_CHECK(i < input->dim_size(), @@ -62,7 +55,7 @@ class ReshapeOp : public Operation { } else { n = shape_data[i]; } - out_shape.push_back(n); + out_shape[i] = n; product *= n; } } @@ -77,14 +70,13 @@ class ReshapeOp : public Operation { } Tensor *output = this->Output(OUTPUT); // NHWC -> NCHW - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && D == DeviceType::CPU + + if (has_df_ && D == DeviceType::CPU && out_shape.size() == 4 && shape->is_weight()) { std::vector dst_dims = {0, 3, 1, 2}; - std::vector out_shape_gpu = TransposeShape( + std::vector trans_shape = TransposeShape( out_shape, dst_dims); - out_shape = out_shape_gpu; + out_shape = trans_shape; } output->ReuseTensorBuffer(*input); @@ -93,6 +85,9 @@ class ReshapeOp : public Operation { return MaceStatus::MACE_SUCCESS; } + private: + bool has_df_; + private: MACE_OP_INPUT_TAGS(INPUT, SHAPE); MACE_OP_OUTPUT_TAGS(OUTPUT); diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index 9334e850fa214ab710969e7f5e7b3e28f17b303d..236e670f1d26b97471e219ba746102d777a008b5 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -23,6 +23,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bicubic.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -197,9 +198,8 @@ class ResizeBicubicOp : public Operation { "size", {-1, -1}); MACE_CHECK(size.size() == 2); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ResizeBicubicKernel(align_corners, - size[0], - size[1])); + kernel_ = make_unique>( + align_corners, size[0], size[1]); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index e4c2f3fc3c64bb08410c709bd2f8b405363dcdd5..46720b3c29d32d01f82902a0bfcc49071aa6aa2a 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -19,6 +19,7 @@ #include #include "mace/core/operator.h" +#include "mace/utils/memory.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bilinear.h" @@ -332,9 +333,8 @@ class ResizeBilinearOp : public Operation { "size", {-1, -1}); MACE_CHECK(size.size() == 2); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ResizeBilinearKernel(align_corners, - size[0], - size[1])); + kernel_ = make_unique>( + align_corners, size[0], size[1]); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/resize_nearest_neighbor.cc b/mace/ops/resize_nearest_neighbor.cc index c40fd46dce86d382df5dec340fbd66cf143f782d..5cdbf07fa101881c4b1c5a4b66476a01199cacee 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ 
b/mace/ops/resize_nearest_neighbor.cc @@ -22,6 +22,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_nearest_neighbor.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -142,8 +143,8 @@ class ResizeNearestNeighborOp : public Operation { bool align_corners = Operation::GetOptionalArg( "align_corners", false); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ResizeNearestNeighborKernel( - align_corners)); + kernel_ = make_unique>( + align_corners); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/sgemm.cc b/mace/ops/sgemm.cc deleted file mode 100644 index 1601aac2cd774d9b35406d30dceea56e27469c93..0000000000000000000000000000000000000000 --- a/mace/ops/sgemm.cc +++ /dev/null @@ -1,1182 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include "mace/ops/sgemm.h" -#include "mace/core/runtime/cpu/cpu_runtime.h" - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) -#define vaddvq_f32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3]) -#endif - -namespace mace { -namespace ops { - -void SGemm::operator()(const SGemmMatrixMap &lhs, - const SGemmMatrixMap &rhs, - SGemmMatrixMap *result, - ScratchBuffer *scratch_buffer) { - if (lhs.is_const() && !rhs.is_const()) { - SGemmMatrixMap lhs_transpose = lhs.transpose(); - SGemmMatrixMap rhs_transpose = rhs.transpose(); - SGemmMatrixMap result_transpose = result->transpose(); - return operator()(rhs_transpose, - lhs_transpose, - &result_transpose, - scratch_buffer); - } - - if (scratch_buffer != nullptr) { - index_t total_size = result->size(); - if (!lhs.is_const()) { - total_size += lhs.size(); - } - if (!rhs.is_const()) { - total_size += rhs.size(); - } - scratch_buffer->GrowSize(total_size * sizeof(float)); - - if (!lhs.is_const()) { - packed_lhs_.reset(new Tensor(scratch_buffer->Scratch( - lhs.size() * sizeof(float)), DT_FLOAT)); - } - if (!rhs.is_const()) { - packed_rhs_.reset(new Tensor(scratch_buffer->Scratch( - rhs.size() * sizeof(float)), DT_FLOAT)); - } - packed_result_.reset(new Tensor(scratch_buffer->Scratch( - result->size() * sizeof(float)), DT_FLOAT)); - } - - if (packed_lhs_.get() == nullptr) { - packed_lhs_.reset(new Tensor(GetCPUAllocator(), DT_FLOAT)); - packed_lhs_->Resize({lhs.size()}); - } - if (packed_rhs_.get() == nullptr) { - packed_rhs_.reset(new Tensor(GetCPUAllocator(), DT_FLOAT)); - packed_rhs_->Resize({rhs.size()}); - } - if (packed_result_.get() == nullptr) { - packed_result_.reset(new Tensor(GetCPUAllocator(), DT_FLOAT)); - packed_result_->Resize({result->size()}); - } - - if (!lhs.is_const() || !packed_) { - PackLhs(lhs, packed_lhs_.get()); - if (lhs.is_const()) { - AdviseFree(reinterpret_cast(const_cast(lhs.data())), - lhs.size() * sizeof(float)); - } - } - if (!rhs.is_const() || !packed_) { - PackRhs(rhs, packed_rhs_.get()); - if (rhs.is_const()) { - 
AdviseFree(reinterpret_cast(const_cast(rhs.data())), - rhs.size() * sizeof(float)); - } - } - packed_ = true; - - RunInternal(*packed_lhs_, - *packed_rhs_, - lhs.batch(), - lhs.row(), - lhs.col(), - rhs.col(), - packed_result_.get()); - - UnPack(*packed_result_, result); -} - -void SGemm::Run(const float *A, - const float *B, - const index_t batch, - const index_t height_a, - const index_t width_a, - const index_t height_b, - const index_t width_b, - const bool transpose_a, - const bool transpose_b, - const bool is_a_weight, - const bool is_b_weight, - float *C, - ScratchBuffer *scratch_buffer) { - index_t height_c = height_a; - index_t width_c = width_b; - if (transpose_a) { - height_c = width_a; - } - if (transpose_b) { - width_c = height_b; - } - - SGemmMatrixMap matrix_a = - SGemmMatrixMap(batch, - height_a, - width_a, - ops::SGemmRowMajor, - A, - is_a_weight); - SGemmMatrixMap matrix_b = - ops::SGemmMatrixMap(batch, - height_b, - width_b, - ops::SGemmRowMajor, - B, - is_b_weight); - if (transpose_a) { - matrix_a = matrix_a.transpose(); - } - if (transpose_b) { - matrix_b = matrix_b.transpose(); - } - SGemmMatrixMap - matrix_c(batch, height_c, width_c, ops::SGemmRowMajor, C); - operator()(matrix_a, matrix_b, &matrix_c, scratch_buffer); -} - -#if defined(MACE_ENABLE_NEON) -#if defined(__aarch64__) - -// calculate 8 rows, 4 cols for each depth -#define MACE_SGEMM_PART_CAL_R8_C4_D1(D, VD, VDN) \ - c0 = vfmaq_laneq_f32(c0, b##D, a##VD, 0); \ - c1 = vfmaq_laneq_f32(c1, b##D, a##VD, 1); \ - c2 = vfmaq_laneq_f32(c2, b##D, a##VD, 2); \ - c3 = vfmaq_laneq_f32(c3, b##D, a##VD, 3); \ - c4 = vfmaq_laneq_f32(c4, b##D, a##VDN, 0); \ - c5 = vfmaq_laneq_f32(c5, b##D, a##VDN, 1); \ - c6 = vfmaq_laneq_f32(c6, b##D, a##VDN, 2); \ - c7 = vfmaq_laneq_f32(c7, b##D, a##VDN, 3); - -// calculate 4 rows, 4 cols for each depth -#define MACE_SGEMM_PART_CAL_R4_C4_D1(D) \ - c0 = vfmaq_laneq_f32(c0, b##D, a##D, 0); \ - c1 = vfmaq_laneq_f32(c1, b##D, a##D, 1); \ - c2 = vfmaq_laneq_f32(c2, b##D, a##D, 2); \ - c3 = vfmaq_laneq_f32(c3, b##D, a##D, 3); - -// calculate 4 cols for 8 depths for each row -#define MACE_SGEMM_PART_CAL_R1_C4_D8(R, VR, VRN) \ - c##R = vfmaq_laneq_f32(c##R, b0, a##VR, 0); \ - c##R = vfmaq_laneq_f32(c##R, b1, a##VR, 1); \ - c##R = vfmaq_laneq_f32(c##R, b2, a##VR, 2); \ - c##R = vfmaq_laneq_f32(c##R, b3, a##VR, 3); \ - c##R = vfmaq_laneq_f32(c##R, b4, a##VRN, 0); \ - c##R = vfmaq_laneq_f32(c##R, b5, a##VRN, 1); \ - c##R = vfmaq_laneq_f32(c##R, b6, a##VRN, 2); \ - c##R = vfmaq_laneq_f32(c##R, b7, a##VRN, 3); - -// calculate 4 cols for 4 depths for each row -#define MACE_SGEMM_PART_CAL_R1_C4_D4(R) \ - c##R = vfmaq_laneq_f32(c##R, b0, a##R, 0); \ - c##R = vfmaq_laneq_f32(c##R, b1, a##R, 1); \ - c##R = vfmaq_laneq_f32(c##R, b2, a##R, 2); \ - c##R = vfmaq_laneq_f32(c##R, b3, a##R, 3); - -// calculate 8 cols for 4 depths for each row -#define MACE_SGEMM_PART_CAL_R1_C8_D4(VR, VRN, R) \ - c##VR = vfmaq_laneq_f32(c##VR, b0, a##R, 0); \ - c##VR = vfmaq_laneq_f32(c##VR, b2, a##R, 1); \ - c##VR = vfmaq_laneq_f32(c##VR, b4, a##R, 2); \ - c##VR = vfmaq_laneq_f32(c##VR, b6, a##R, 3); \ - c##VRN = vfmaq_laneq_f32(c##VRN, b1, a##R, 0); \ - c##VRN = vfmaq_laneq_f32(c##VRN, b3, a##R, 1); \ - c##VRN = vfmaq_laneq_f32(c##VRN, b5, a##R, 2); \ - c##VRN = vfmaq_laneq_f32(c##VRN, b7, a##R, 3); - -#else - -#define MACE_SGEMM_PART_CAL_R8_C4_D1(D, VD, VDN) \ - c0 = vmlaq_lane_f32(c0, b##D, vget_low_f32(a##VD), 0); \ - c1 = vmlaq_lane_f32(c1, b##D, vget_low_f32(a##VD), 1); \ - c2 = vmlaq_lane_f32(c2, b##D, 
vget_high_f32(a##VD), 0); \ - c3 = vmlaq_lane_f32(c3, b##D, vget_high_f32(a##VD), 1); \ - c4 = vmlaq_lane_f32(c4, b##D, vget_low_f32(a##VDN), 0); \ - c5 = vmlaq_lane_f32(c5, b##D, vget_low_f32(a##VDN), 1); \ - c6 = vmlaq_lane_f32(c6, b##D, vget_high_f32(a##VDN), 0); \ - c7 = vmlaq_lane_f32(c7, b##D, vget_high_f32(a##VDN), 1); - -#define MACE_SGEMM_PART_CAL_R4_C4_D1(D) \ - c0 = vmlaq_lane_f32(c0, b##D, vget_low_f32(a##D), 0); \ - c1 = vmlaq_lane_f32(c1, b##D, vget_low_f32(a##D), 1); \ - c2 = vmlaq_lane_f32(c2, b##D, vget_high_f32(a##D), 0); \ - c3 = vmlaq_lane_f32(c3, b##D, vget_high_f32(a##D), 1); - -#define MACE_SGEMM_PART_CAL_R1_C4_D8(R, VR, VRN) \ - c##R = vmlaq_lane_f32(c##R, b0, vget_low_f32(a##VR), 0); \ - c##R = vmlaq_lane_f32(c##R, b1, vget_low_f32(a##VR), 1); \ - c##R = vmlaq_lane_f32(c##R, b2, vget_high_f32(a##VR), 0); \ - c##R = vmlaq_lane_f32(c##R, b3, vget_high_f32(a##VR), 1); \ - c##R = vmlaq_lane_f32(c##R, b4, vget_low_f32(a##VRN), 0); \ - c##R = vmlaq_lane_f32(c##R, b5, vget_low_f32(a##VRN), 1); \ - c##R = vmlaq_lane_f32(c##R, b6, vget_high_f32(a##VRN), 0); \ - c##R = vmlaq_lane_f32(c##R, b7, vget_high_f32(a##VRN), 1); - -#define MACE_SGEMM_PART_CAL_R1_C4_D4(R) \ - c##R = vmlaq_lane_f32(c##R, b0, vget_low_f32(a##R), 0); \ - c##R = vmlaq_lane_f32(c##R, b1, vget_low_f32(a##R), 1); \ - c##R = vmlaq_lane_f32(c##R, b2, vget_high_f32(a##R), 0); \ - c##R = vmlaq_lane_f32(c##R, b3, vget_high_f32(a##R), 1); - -#endif // __aarch64__ -#endif // MACE_ENABLE_NEON - -void SGemm::RunInternal(const PackedBlock &lhs, - const PackedBlock &rhs, - const index_t batch, - const index_t height, - const index_t depth, - const index_t width, - PackedBlock *result) { - const float *lhs_data = lhs.data(); - const float *rhs_data = rhs.data(); - float *result_data = result->mutable_data(); - -#define MACE_SGEMM_RUN_PER_BATCH \ - for (index_t b = 0; b < batch; ++b) { \ - RunPerBatch(lhs_data + b * height * depth, \ - rhs_data + b * depth * width, \ - height, \ - depth, \ - width, \ - result_data + b * height * width); \ - } - - if (batch >= MaceOpenMPThreadCount) { -#pragma omp parallel for schedule(runtime) - MACE_SGEMM_RUN_PER_BATCH - } else { - MACE_SGEMM_RUN_PER_BATCH - } - -#undef MACE_SGEMM_RUN_PER_BATCH -} - -void SGemm::RunPerBatch(const float *lhs_data, - const float *rhs_data, - const index_t height, - const index_t depth, - const index_t width, - float *result_data) { -#if defined(MACE_ENABLE_NEON) - const index_t block_w = width >> 2; - const index_t remain_w = width - (block_w << 2); -#else - const index_t remain_w = width; -#endif - -#if defined(MACE_ENABLE_NEON) - // TODO(liyin): make better use l2(l1) cache, try to fit as much lhs data as - // as possible to cache, by tiling lhs by height and rhs by width. 
- - // w: 4 -#pragma omp parallel for schedule(runtime) - for (index_t bw = 0; bw < block_w; ++bw) { - index_t remain_h = height; - index_t block_h = 0; - - const float *lhs_ptr = lhs_data; - float *res_ptr = result_data + height * (bw << 2); - -#if defined(__aarch64__) - block_h = remain_h >> 3; - remain_h -= (block_h << 3); - - // h: 8 - for (index_t bh = 0; bh < block_h; ++bh) { - const float *rhs_ptr = rhs_data + depth * (bw << 2); - - index_t remain_d = depth; - index_t block_d = remain_d >> 3; - remain_d -= (block_d << 3); - - float32x4_t c0, c1, c2, c3, c4, c5, c6, c7; - c0 = vdupq_n_f32(0.f); - c1 = vdupq_n_f32(0.f); - c2 = vdupq_n_f32(0.f); - c3 = vdupq_n_f32(0.f); - c4 = vdupq_n_f32(0.f); - c5 = vdupq_n_f32(0.f); - c6 = vdupq_n_f32(0.f); - c7 = vdupq_n_f32(0.f); - - // d: 8 - for (index_t bd = 0; bd < block_d; ++bd) { - // 8.8.4 - float32x4_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, - a14, a15; - float32x4_t b0, b1, b2, b3, b4, b5, b6, b7; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - a2 = vld1q_f32(lhs_ptr + 8); - a3 = vld1q_f32(lhs_ptr + 12); - a4 = vld1q_f32(lhs_ptr + 16); - a5 = vld1q_f32(lhs_ptr + 20); - a6 = vld1q_f32(lhs_ptr + 24); - a7 = vld1q_f32(lhs_ptr + 28); - a8 = vld1q_f32(lhs_ptr + 32); - a9 = vld1q_f32(lhs_ptr + 36); - a10 = vld1q_f32(lhs_ptr + 40); - a11 = vld1q_f32(lhs_ptr + 44); - a12 = vld1q_f32(lhs_ptr + 48); - a13 = vld1q_f32(lhs_ptr + 52); - a14 = vld1q_f32(lhs_ptr + 56); - a15 = vld1q_f32(lhs_ptr + 60); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - b4 = vld1q_f32(rhs_ptr + 16); - b5 = vld1q_f32(rhs_ptr + 20); - b6 = vld1q_f32(rhs_ptr + 24); - b7 = vld1q_f32(rhs_ptr + 28); - - MACE_SGEMM_PART_CAL_R8_C4_D1(0, 0, 1); // d = 1 - MACE_SGEMM_PART_CAL_R8_C4_D1(1, 2, 3); // d = 2 - MACE_SGEMM_PART_CAL_R8_C4_D1(2, 4, 5); - MACE_SGEMM_PART_CAL_R8_C4_D1(3, 6, 7); - MACE_SGEMM_PART_CAL_R8_C4_D1(4, 8, 9); - MACE_SGEMM_PART_CAL_R8_C4_D1(5, 10, 11); - MACE_SGEMM_PART_CAL_R8_C4_D1(6, 12, 13); - MACE_SGEMM_PART_CAL_R8_C4_D1(7, 14, 15); - - lhs_ptr += 64; - rhs_ptr += 32; - } - - block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - // d: 4 - for (index_t bd = 0; bd < block_d; ++bd) { - // 8.4.4 - float32x4_t a0, a1, a2, a3, a4, a5, a6, a7; - float32x4_t b0, b1, b2, b3; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - a2 = vld1q_f32(lhs_ptr + 8); - a3 = vld1q_f32(lhs_ptr + 12); - a4 = vld1q_f32(lhs_ptr + 16); - a5 = vld1q_f32(lhs_ptr + 20); - a6 = vld1q_f32(lhs_ptr + 24); - a7 = vld1q_f32(lhs_ptr + 28); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - - MACE_SGEMM_PART_CAL_R8_C4_D1(0, 0, 1); // d = 1 - MACE_SGEMM_PART_CAL_R8_C4_D1(1, 2, 3); // d = 2 - MACE_SGEMM_PART_CAL_R8_C4_D1(2, 4, 5); - MACE_SGEMM_PART_CAL_R8_C4_D1(3, 6, 7); - - lhs_ptr += 32; - rhs_ptr += 16; - } - - // TODO(liyin): handle remain by each case - // d: remain - for (index_t d = 0; d < remain_d; ++d) { - // 8.1.4 - float32x4_t a0, a1; - float32x4_t b0; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - - b0 = vld1q_f32(rhs_ptr); - - MACE_SGEMM_PART_CAL_R8_C4_D1(0, 0, 1); // d = 1 - - lhs_ptr += 8; - rhs_ptr += 4; - } - - vst1q_f32(res_ptr, c0); - vst1q_f32(res_ptr + 4, c1); - vst1q_f32(res_ptr + 8, c2); - vst1q_f32(res_ptr + 12, c3); - vst1q_f32(res_ptr + 16, c4); - vst1q_f32(res_ptr + 20, c5); - vst1q_f32(res_ptr + 24, c6); - vst1q_f32(res_ptr + 28, c7); - - res_ptr += 32; - } // bh: 8 
-#endif // __aarch64__ - - // h: 4 - block_h = remain_h >> 2; - remain_h -= (block_h << 2); - - for (index_t bh = 0; bh < block_h; ++bh) { - const float *rhs_ptr = rhs_data + depth * (bw << 2); - - index_t remain_d = depth; - index_t block_d = 0; - - float32x4_t c0, c1, c2, c3; - c0 = vdupq_n_f32(0.f); - c1 = vdupq_n_f32(0.f); - c2 = vdupq_n_f32(0.f); - c3 = vdupq_n_f32(0.f); - - // d: 8 - block_d = remain_d >> 3; - remain_d -= (block_d << 3); - -#if defined(__aarch64__) - for (index_t bd = 0; bd < block_d; ++bd) { - // 4.8.4 - float32x4_t a0, a1, a2, a3, a4, a5, a6, a7; - float32x4_t b0, b1, b2, b3, b4, b5, b6, b7; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - a2 = vld1q_f32(lhs_ptr + 8); - a3 = vld1q_f32(lhs_ptr + 12); - a4 = vld1q_f32(lhs_ptr + 16); - a5 = vld1q_f32(lhs_ptr + 20); - a6 = vld1q_f32(lhs_ptr + 24); - a7 = vld1q_f32(lhs_ptr + 28); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - b4 = vld1q_f32(rhs_ptr + 16); - b5 = vld1q_f32(rhs_ptr + 20); - b6 = vld1q_f32(rhs_ptr + 24); - b7 = vld1q_f32(rhs_ptr + 28); - - MACE_SGEMM_PART_CAL_R4_C4_D1(0); // d = 1 - MACE_SGEMM_PART_CAL_R4_C4_D1(1); // d = 2 - MACE_SGEMM_PART_CAL_R4_C4_D1(2); - MACE_SGEMM_PART_CAL_R4_C4_D1(3); - MACE_SGEMM_PART_CAL_R4_C4_D1(4); - MACE_SGEMM_PART_CAL_R4_C4_D1(5); - MACE_SGEMM_PART_CAL_R4_C4_D1(6); - MACE_SGEMM_PART_CAL_R4_C4_D1(7); - - lhs_ptr += 32; - rhs_ptr += 32; - } -#else // arm v7 - // 4.8.4 - if (block_d > 0) { - asm volatile( - "0: \n" - - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" - - "vld1.f32 {d20-d21}, [%[rhs_ptr]]! \n" - "vld1.f32 {d22-d23}, [%[rhs_ptr]]! \n" - "vld1.f32 {d24-d25}, [%[rhs_ptr]]! \n" - - "vmla.f32 %q[c0], q10, d0[0] \n" - "vmla.f32 %q[c1], q10, d0[1] \n" - "vmla.f32 %q[c2], q10, d1[0] \n" - "vmla.f32 %q[c3], q10, d1[1] \n" - - "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - "vld1.f32 {d26-d27}, [%[rhs_ptr]]! \n" - - "vmla.f32 %q[c0], q11, d2[0] \n" - "vmla.f32 %q[c1], q11, d2[1] \n" - "vmla.f32 %q[c2], q11, d3[0] \n" - "vmla.f32 %q[c3], q11, d3[1] \n" - - "vld1.f32 {d8-d9}, [%[lhs_ptr]]! \n" - "vld1.f32 {d28-d29}, [%[rhs_ptr]]! \n" - - "vmla.f32 %q[c0], q12, d4[0] \n" - "vmla.f32 %q[c1], q12, d4[1] \n" - "vmla.f32 %q[c2], q12, d5[0] \n" - "vmla.f32 %q[c3], q12, d5[1] \n" - - "vld1.f32 {d10-d11}, [%[lhs_ptr]]! \n" - "vld1.f32 {d30-d31}, [%[rhs_ptr]]! \n" - - "vmla.f32 %q[c0], q13, d6[0] \n" - "vmla.f32 %q[c1], q13, d6[1] \n" - "vmla.f32 %q[c2], q13, d7[0] \n" - "vmla.f32 %q[c3], q13, d7[1] \n" - - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - - "vld1.f32 {d20-d21}, [%[rhs_ptr]]! \n" - "vld1.f32 {d22-d23}, [%[rhs_ptr]]! 
\n" - - "vmla.f32 %q[c0], q14, d8[0] \n" - "vmla.f32 %q[c1], q14, d8[1] \n" - "vmla.f32 %q[c2], q14, d9[0] \n" - "vmla.f32 %q[c3], q14, d9[1] \n" - - "vmla.f32 %q[c0], q15, d10[0] \n" - "vmla.f32 %q[c1], q15, d10[1] \n" - "vmla.f32 %q[c2], q15, d11[0] \n" - "vmla.f32 %q[c3], q15, d11[1] \n" - - "vmla.f32 %q[c0], q10, d0[0] \n" - "vmla.f32 %q[c1], q10, d0[1] \n" - "vmla.f32 %q[c2], q10, d1[0] \n" - "vmla.f32 %q[c3], q10, d1[1] \n" - - "subs %[block_d], %[block_d], #1 \n" - - "vmla.f32 %q[c0], q11, d2[0] \n" - "vmla.f32 %q[c1], q11, d2[1] \n" - "vmla.f32 %q[c2], q11, d3[0] \n" - "vmla.f32 %q[c3], q11, d3[1] \n" - - "bne 0b \n" - : // outputs - [lhs_ptr] "+r"(lhs_ptr), - [rhs_ptr] "+r"(rhs_ptr), - [res_ptr] "+r"(res_ptr), - [block_d] "+r"(block_d), - [c0] "+w"(c0), - [c1] "+w"(c1), - [c2] "+w"(c2), - [c3] "+w"(c3) - : // inputs - : // clabbers - "cc", "memory", - "q0", "q1", "q2", "q3", "q4", "q5", - "q10", "q11", "q12", "q13", "q14", "q15"); - } -#endif // __aarch64__ - - // d: 4 - block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - for (index_t bd = 0; bd < block_d; ++bd) { - // 4.4.4 - float32x4_t a0, a1, a2, a3; - float32x4_t b0, b1, b2, b3; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - a2 = vld1q_f32(lhs_ptr + 8); - a3 = vld1q_f32(lhs_ptr + 12); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - - MACE_SGEMM_PART_CAL_R4_C4_D1(0); // d = 1 - MACE_SGEMM_PART_CAL_R4_C4_D1(1); // d = 2 - MACE_SGEMM_PART_CAL_R4_C4_D1(2); - MACE_SGEMM_PART_CAL_R4_C4_D1(3); - - lhs_ptr += 16; - rhs_ptr += 16; - } - - // d: remain - for (index_t d = 0; d < remain_d; ++d) { - // 4.1.4 - float32x4_t a0; - float32x4_t b0; - - a0 = vld1q_f32(lhs_ptr); - - b0 = vld1q_f32(rhs_ptr); - - MACE_SGEMM_PART_CAL_R4_C4_D1(0); // d = 1 - - lhs_ptr += 4; - rhs_ptr += 4; - } - vst1q_f32(res_ptr, c0); - vst1q_f32(res_ptr + 4, c1); - vst1q_f32(res_ptr + 8, c2); - vst1q_f32(res_ptr + 12, c3); - - res_ptr += 16; - } // bh: 4 - - // h: 1 - for (index_t h = 0; h < remain_h; ++h) { - const float *rhs_ptr = rhs_data + depth * (bw << 2); - - index_t remain_d = depth; - index_t block_d = 0; - - float32x4_t c0 = vdupq_n_f32(0.f); - - // d: 8 - block_d = remain_d >> 3; - remain_d -= (block_d << 3); - - for (index_t bd = 0; bd < block_d; ++bd) { - // 1.8.4 - float32x4_t a0, a1; - float32x4_t b0, b1, b2, b3, b4, b5, b6, b7; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - b4 = vld1q_f32(rhs_ptr + 16); - b5 = vld1q_f32(rhs_ptr + 20); - b6 = vld1q_f32(rhs_ptr + 24); - b7 = vld1q_f32(rhs_ptr + 28); - - MACE_SGEMM_PART_CAL_R1_C4_D8(0, 0, 1); - - lhs_ptr += 8; - rhs_ptr += 32; - } - - block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - // d: 4 - for (index_t bd = 0; bd < block_d; ++bd) { - // 1.4.4 - float32x4_t a0; - float32x4_t b0, b1, b2, b3; - - a0 = vld1q_f32(lhs_ptr); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - - MACE_SGEMM_PART_CAL_R1_C4_D4(0); - - lhs_ptr += 4; - rhs_ptr += 16; - } - - // d: remain - float s0 = 0; - float s1 = 0; - float s2 = 0; - float s3 = 0; - for (index_t d = 0; d < remain_d; ++d) { - // 1.1.4 - s0 += lhs_ptr[0] * rhs_ptr[0]; - s1 += lhs_ptr[0] * rhs_ptr[1]; - s2 += lhs_ptr[0] * rhs_ptr[2]; - s3 += lhs_ptr[0] * rhs_ptr[3]; - lhs_ptr += 1; - rhs_ptr += 4; - } - float32x4_t c0_remain = {s0, s1, s2, s3}; - 
c0 += c0_remain; - - vst1q_f32(res_ptr, c0); - res_ptr += 4; - } // bh: remain - } // bw - -#endif // MACE_ENABLE_NEON - - // ========================== remain width =========================== - - result_data += (width - remain_w) * height; - rhs_data += (width - remain_w) * depth; - - // w: 1 -#pragma omp parallel for schedule(runtime) - for (index_t bw = 0; bw < remain_w; ++bw) { - index_t remain_h = height; - - const float *lhs_ptr = lhs_data; - float *res_ptr = result_data + height * bw; - -#if defined(MACE_ENABLE_NEON) - index_t block_h = 0; -#if defined(__aarch64__) - block_h = remain_h >> 3; - remain_h -= (block_h << 3); - - // h: 8 - for (index_t bh = 0; bh < block_h; ++bh) { - const float *rhs_ptr = rhs_data + depth * bw; - - index_t remain_d = depth; - - float32x4_t c0, c1; - c0 = vdupq_n_f32(0.f); - c1 = vdupq_n_f32(0.f); - - index_t block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - // d: 4 - for (index_t bd = 0; bd < block_d; ++bd) { - // 8.4.1 - float32x4_t b0, b1, b2, b3, b4, b5, b6, b7; - float32x4_t a0; - - b0 = vld1q_f32(lhs_ptr); - b1 = vld1q_f32(lhs_ptr + 4); - b2 = vld1q_f32(lhs_ptr + 8); - b3 = vld1q_f32(lhs_ptr + 12); - b4 = vld1q_f32(lhs_ptr + 16); - b5 = vld1q_f32(lhs_ptr + 20); - b6 = vld1q_f32(lhs_ptr + 24); - b7 = vld1q_f32(lhs_ptr + 28); - - a0 = vld1q_f32(rhs_ptr); - - MACE_SGEMM_PART_CAL_R1_C8_D4(0, 1, 0); - - lhs_ptr += 32; - rhs_ptr += 4; - } - - // d: remain - for (index_t d = 0; d < remain_d; ++d) { - // 8.1.1 - float32x4_t b0, b1; - float32x4_t a0 = vdupq_n_f32(rhs_ptr[0]); - - b0 = vld1q_f32(lhs_ptr); - b1 = vld1q_f32(lhs_ptr + 4); - - c0 = vfmaq_laneq_f32(c0, b0, a0, 0); - c1 = vfmaq_laneq_f32(c1, b1, a0, 0); - - lhs_ptr += 8; - rhs_ptr += 1; - } - - vst1q_f32(res_ptr, c0); - vst1q_f32(res_ptr + 4, c1); - - res_ptr += 8; - } // bh: 8 -#endif - - // h: 4 - block_h = remain_h >> 2; - remain_h -= (block_h << 2); - - for (index_t bh = 0; bh < block_h; ++bh) { - const float *rhs_ptr = rhs_data + depth * bw; - - index_t remain_d = depth; - index_t block_d = 0; - - float32x4_t c0 = vdupq_n_f32(0.f); - - block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - // d: 4 - for (index_t bd = 0; bd < block_d; ++bd) { - // 4.4.1 - float32x4_t b0, b1, b2, b3; - float32x4_t a0; - - b0 = vld1q_f32(lhs_ptr); - b1 = vld1q_f32(lhs_ptr + 4); - b2 = vld1q_f32(lhs_ptr + 8); - b3 = vld1q_f32(lhs_ptr + 12); - - a0 = vld1q_f32(rhs_ptr); - - MACE_SGEMM_PART_CAL_R1_C4_D4(0); - - lhs_ptr += 16; - rhs_ptr += 4; - } - - // d: remain - for (index_t d = 0; d < remain_d; ++d) { - // 4.1.1 - float32x4_t b0; - float32x2_t a0 = vdup_n_f32(rhs_ptr[0]); - - b0 = vld1q_f32(lhs_ptr); - - c0 = vmlaq_lane_f32(c0, b0, a0, 0); - - lhs_ptr += 4; - rhs_ptr += 1; - } - vst1q_f32(res_ptr, c0); - - res_ptr += 4; - } // bh: 4 - -#endif // MACE_ENABLE_NEON - - // h: 1 - for (index_t h = 0; h < remain_h; ++h) { - const float *rhs_ptr = rhs_data + depth * bw; - - index_t remain_d = depth; - - float sum = 0.f; - -#if defined(MACE_ENABLE_NEON) - index_t block_d = 0; - - float32x4_t c0, c1; - c0 = vdupq_n_f32(0.f); - c1 = vdupq_n_f32(0.f); - - block_d = remain_d >> 3; - remain_d -= (block_d << 3); - - // d: 8 - for (index_t bd = 0; bd < block_d; ++bd) { - // 1.8.1 - float32x4_t a0, a1; - float32x4_t b0, b1; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - - c0 = vmlaq_f32(c0, a0, b0); - c1 = vmlaq_f32(c1, a1, b1); - - lhs_ptr += 8; - rhs_ptr += 8; - } - - block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - // d: 4 - for 
(index_t bd = 0; bd < block_d; ++bd) { - // 1.4.1 - float32x4_t a0; - float32x4_t b0; - - a0 = vld1q_f32(lhs_ptr); - b0 = vld1q_f32(rhs_ptr); - - c0 = vmlaq_f32(c0, a0, b0); - - lhs_ptr += 4; - rhs_ptr += 4; - } - sum += vaddvq_f32(c0); - sum += vaddvq_f32(c1); -#endif // MACE_ENABLE_NEON - - // d: remain - for (index_t d = 0; d < remain_d; ++d) { - // 1.1.1 - sum += lhs_ptr[0] * rhs_ptr[0]; - lhs_ptr += 1; - rhs_ptr += 1; - } - - *res_ptr = sum; - ++res_ptr; - } // bh: remain - } // bw -} - -void SGemm::PackLhs(const SGemmMatrixMap &lhs, - PackedBlock *packed_block) { - Pack(lhs, PackOrder::SGemmColMajor, packed_block); -} - -void SGemm::PackRhs(const SGemmMatrixMap &rhs, - PackedBlock *packed_block) { - Pack(rhs, PackOrder::SGemmRowMajor, packed_block); -} - -void SGemm::Pack(const SGemmMatrixMap &src, - const PackOrder order, - PackedBlock *packed_block) { - MACE_CHECK_NOTNULL(packed_block); - - const index_t height = src.row(); - const index_t width = src.col(); - auto packed_data = packed_block->mutable_data(); - -#define MACE_SGEMM_PACK_PER_BATCH \ - for (index_t b = 0; b < src.batch(); ++b) { \ - PackPerBatch(src, order, b, packed_data + b * height * width); \ - } - if (src.batch() >= MaceOpenMPThreadCount) { -#pragma omp parallel for schedule(runtime) - MACE_SGEMM_PACK_PER_BATCH - } else { - MACE_SGEMM_PACK_PER_BATCH - } -#undef MACE_SGEMM_PACK_PER_BATCH -} - -void SGemm::UnPack(const PackedBlock &packed_result, - SGemmMatrixMap *matrix_map) { - MACE_CHECK_NOTNULL(matrix_map); - - const index_t height = matrix_map->row(); - const index_t width = matrix_map->col(); - auto packed_data = packed_result.data(); - -#define MACE_SGEMM_UNPACK_PER_BATCH \ - for (index_t b = 0; b < matrix_map->batch(); ++b) { \ - UnPackPerBatch(packed_data + b * height * width, b, matrix_map); \ - } - - if (matrix_map->batch() >= MaceOpenMPThreadCount) { -#pragma omp parallel for schedule(runtime) - MACE_SGEMM_UNPACK_PER_BATCH - } else { - MACE_SGEMM_UNPACK_PER_BATCH - } -#undef MACE_SGEMM_UNPACK_PER_BATCH -} - -void SGemm::PackPerBatch(const SGemmMatrixMap &src, - const PackOrder order, - const index_t batch_index, - float *packed_data) { - MACE_CHECK_NOTNULL(packed_data); - - const index_t height = src.row(); - const index_t width = src.col(); - auto src_data = src.batch_data(batch_index); - - if (src.map_major() == Major::SGemmRowMajor - && order == PackOrder::SGemmColMajor) { - // This is for packing no-transpose lhs. 
- index_t h = 0; -#if defined(MACE_ENABLE_NEON) -#if defined(__aarch64__) -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih <= height - 8; ih += 8) { - const float *src_data_ptr = src_data + ih * width; - float *packed_data_ptr = packed_data + ih * width; - for (index_t w = 0; w < width; ++w) { - const index_t src_offset = w; - const index_t packed_offset = w * 8; - float32x4_t vs0 = {src_data_ptr[src_offset], - src_data_ptr[src_offset + width], - src_data_ptr[src_offset + 2 * width], - src_data_ptr[src_offset + 3 * width]}; - float32x4_t vs1 = {src_data_ptr[src_offset + 4 * width], - src_data_ptr[src_offset + 5 * width], - src_data_ptr[src_offset + 6 * width], - src_data_ptr[src_offset + 7 * width]}; - vst1q_f32(packed_data_ptr + packed_offset, vs0); - vst1q_f32(packed_data_ptr + packed_offset + 4, vs1); - } - } - h += (height - h) / 8 * 8; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih <= height - 4; ih += 4) { - const float *src_data_ptr = src_data + ih * width; - float *packed_data_ptr = packed_data + ih * width; - for (index_t w = 0; w < width; ++w) { - const index_t src_offset = w; - const index_t packed_offset = w * 4; - float32x4_t vs = {src_data_ptr[src_offset], - src_data_ptr[src_offset + width], - src_data_ptr[src_offset + 2 * width], - src_data_ptr[src_offset + 3 * width]}; - vst1q_f32(packed_data_ptr + packed_offset, vs); - } - } - h += (height - h) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih < height; ++ih) { - std::copy_n(src_data + ih * width, width, packed_data + ih * width); - } - } else if (src.map_major() == Major::SGemmColMajor && - order == PackOrder::SGemmColMajor) { - // This is for packing transpose-needed lhs. - index_t h = 0; -#if defined(MACE_ENABLE_NEON) -#if defined(__aarch64__) -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih <= height - 8; ih += 8) { - const float *src_data_ptr = src_data + ih; - float *packed_data_ptr = packed_data + ih * width; - for (index_t w = 0; w < width; ++w) { - const index_t src_offset = w * height; - const index_t packed_offset = w * 8; - float32x4_t vs0 = vld1q_f32(src_data_ptr + src_offset); - float32x4_t vs1 = vld1q_f32(src_data_ptr + src_offset + 4); - vst1q_f32(packed_data_ptr + packed_offset, vs0); - vst1q_f32(packed_data_ptr + packed_offset + 4, vs1); - } - } - h += (height - h) / 8 * 8; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih <= height - 4; ih += 4) { - const float *src_data_ptr = src_data + ih; - float *packed_data_ptr = packed_data + ih * width; - for (index_t w = 0; w < width; ++w) { - const index_t src_offset = w * height; - const index_t packed_offset = w * 4; - float32x4_t vs = vld1q_f32(src_data_ptr + src_offset); - vst1q_f32(packed_data_ptr + packed_offset, vs); - } - } - h += (height - h) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih < height; ++ih) { - const float *src_data_ptr = src_data + ih; - float *packed_data_ptr = packed_data + ih * width; - for (index_t w = 0; w < width; ++w) { - packed_data_ptr[w] = src_data_ptr[w * height]; - } - } - } else if (src.map_major() == Major::SGemmRowMajor && - order == PackOrder::SGemmRowMajor) { - // This is for packing no-transpose rhs. 
- index_t w = 0; -#if defined(MACE_ENABLE_NEON) -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw <= width - 4; iw += 4) { - const float *src_data_ptr = src_data + iw; - float *packed_data_ptr = packed_data + iw * height; - for (index_t h = 0; h < height; ++h) { - const index_t src_offset = h * width; - const index_t packed_offset = h * 4; - float32x4_t vs = vld1q_f32(src_data_ptr + src_offset); - vst1q_f32(packed_data_ptr + packed_offset, vs); - } - } - w += (width - w) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw < width; ++iw) { - const float *src_data_ptr = src_data + iw; - float *packed_data_ptr = packed_data + iw * height; - for (index_t h = 0; h < height; ++h) { - packed_data_ptr[h] = src_data_ptr[h * width]; - } - } - } else if (src.map_major() == Major::SGemmColMajor && - order == PackOrder::SGemmRowMajor) { - // This is for packing transpose-needed rhs. - index_t w = 0; -#if defined(MACE_ENABLE_NEON) -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw <= width - 4; iw += 4) { - const float *src_data_ptr = src_data + iw * height; - float *packed_data_ptr = packed_data + iw * height; - for (index_t h = 0; h < height; ++h) { - const index_t src_offset = h; - const index_t packed_offset = h * 4; - float32x4_t vs = {src_data_ptr[src_offset], - src_data_ptr[src_offset + height], - src_data_ptr[src_offset + 2 * height], - src_data_ptr[src_offset + 3 * height]}; - vst1q_f32(packed_data_ptr + packed_offset, vs); - } - } - w += (width - w) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw < width; ++iw) { - std::copy_n(src_data + iw * height, height, packed_data + iw * height); - } - } -} - -void SGemm::UnPackPerBatch(const float *packed_data, - const index_t batch_index, - SGemmMatrixMap *matrix_map) { - MACE_CHECK_NOTNULL(matrix_map); - - const index_t height = matrix_map->row(); - const index_t width = matrix_map->col(); - auto unpacked_data = matrix_map->batch_data(batch_index); - - if (matrix_map->map_major() == Major::SGemmRowMajor) { - // This is for non-transposed result - index_t w = 0; -#if defined(MACE_ENABLE_NEON) -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw <= width - 4; iw += 4) { - const float *packed_data_ptr = packed_data + iw * height; - float *unpacked_data_ptr = unpacked_data + iw; - for (index_t h = 0; h < height; ++h) { - const index_t packed_offset = h * 4; - const index_t unpacked_offset = h * width; - float32x4_t vs = vld1q_f32(packed_data_ptr + packed_offset); - vst1q_f32(unpacked_data_ptr + unpacked_offset, vs); - } - } - w += (width - w) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw < width; ++iw) { - const float *packed_data_ptr = packed_data + iw * height; - float *unpacked_data_ptr = unpacked_data + iw; - for (index_t h = 0; h < height; ++h) { - unpacked_data_ptr[h * width] = packed_data_ptr[h]; - } - } - } else { - // This is for transposed result - index_t w = 0; -#if defined(MACE_ENABLE_NEON) -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw <= width - 4; iw += 4) { - const float *packed_data_ptr = packed_data + iw * height; - float *unpacked_data_ptr = unpacked_data + iw * height; - for (index_t h = 0; h < height; ++h) { - const index_t packed_offset = h * 4; - const index_t unpacked_offset = h; - float32x4_t vs = vld1q_f32(packed_data_ptr + packed_offset); - unpacked_data_ptr[unpacked_offset] = vs[0]; - unpacked_data_ptr[unpacked_offset + height] = vs[1]; 
- unpacked_data_ptr[unpacked_offset + 2 * height] = vs[2]; - unpacked_data_ptr[unpacked_offset + 3 * height] = vs[3]; - } - } - w += (width - w) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw < width; ++iw) { - std::copy_n( - packed_data + iw * height, height, unpacked_data + iw * height); - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/sgemm.h b/mace/ops/sgemm.h deleted file mode 100644 index 1320d1bef77710f9b9f4d662ed53c213be83d4c2..0000000000000000000000000000000000000000 --- a/mace/ops/sgemm.h +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This implementation is deprecated. use mace/ops/arm/fp32/gemm.h instead. - -#ifndef MACE_OPS_SGEMM_H_ -#define MACE_OPS_SGEMM_H_ - -#include -#include - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include "mace/core/types.h" -#include "mace/core/allocator.h" -#include "mace/core/tensor.h" - -namespace mace { -namespace ops { - -enum Major { - SGemmRowMajor, - SGemmColMajor -}; - -template -class SGemmMatrixMap { - public: - SGemmMatrixMap() {} - - SGemmMatrixMap(const index_t batch, - const index_t row, - const index_t col, - const Major major, - T *data, - const bool is_const = false) : - batch_(batch), - row_(row), - col_(col), - stride_(major == SGemmRowMajor ? col : row), - major_(major), - data_(data), - is_const_(is_const) {} - - SGemmMatrixMap transpose() const { - Major transpose_major = - major_ == SGemmRowMajor ? 
SGemmColMajor : SGemmRowMajor; - return SGemmMatrixMap(batch_, - col_, - row_, - transpose_major, - data_, - is_const_); - } - - index_t batch() const { - return batch_; - } - - index_t row() const { - return row_; - } - - index_t col() const { - return col_; - } - - index_t stride() const { - return stride_; - } - - Major map_major() const { - return major_; - } - - T *data() const { - return data_; - } - - T *batch_data(index_t batch) const { - return data_ + batch * row_ * col_; - } - - index_t size() const { - return batch_ * row_ * col_; - } - - bool is_const() const { - return is_const_; - } - - private: - index_t batch_; - index_t row_; - index_t col_; - index_t stride_; - Major major_; - T *data_; - bool is_const_; -}; - -typedef Major PackOrder; -typedef Tensor PackedBlock; - -class SGemm { - public: - SGemm() - : packed_lhs_(nullptr), - packed_rhs_(nullptr), - packed_(false) {} - - void operator()(const SGemmMatrixMap &lhs, - const SGemmMatrixMap &rhs, - SGemmMatrixMap *result, - ScratchBuffer *scratch_buffer = nullptr); - - void Run(const float *A, - const float *B, - const index_t batch, - const index_t height_a, - const index_t width_a, - const index_t height_b, - const index_t width_b, - const bool transpose_a, - const bool transpose_b, - const bool is_a_weight, - const bool is_b_weight, - float *C, - ScratchBuffer *scratch_buffer = nullptr); - - void PackLhs(const SGemmMatrixMap &lhs, - PackedBlock *packed_block); - - void PackRhs(const SGemmMatrixMap &rhs, - PackedBlock *packed_block); - - void UnPack(const PackedBlock &packed_result, - SGemmMatrixMap *matrix_map); - - private: - void Pack(const SGemmMatrixMap &src, - const PackOrder order, - PackedBlock *packed_block); - - void PackPerBatch(const SGemmMatrixMap &src, - const PackOrder order, - const index_t batch_index, - float *packed_data); - - void UnPackPerBatch(const float *packed_data, - const index_t batch_index, - SGemmMatrixMap *matrix_map); - - void RunInternal(const PackedBlock &lhs, - const PackedBlock &rhs, - const index_t batch, - const index_t height, - const index_t depth, - const index_t width, - PackedBlock *result); - - void RunPerBatch(const float *lhs, - const float *rhs, - const index_t height, - const index_t depth, - const index_t width, - float *result); - - std::unique_ptr packed_lhs_; - std::unique_ptr packed_rhs_; - std::unique_ptr packed_result_; - - bool packed_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SGEMM_H_ diff --git a/mace/ops/sgemm_pack_test.cc b/mace/ops/sgemm_pack_test.cc deleted file mode 100644 index 69766cb9eaf706d31f9e637d93809404108073ba..0000000000000000000000000000000000000000 --- a/mace/ops/sgemm_pack_test.cc +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include - -#include "mace/ops/sgemm.h" - -namespace mace { -namespace ops { -namespace test { - -namespace { -void TestPack(const std::vector &data, - const std::vector &expected_data, - const index_t height, - const index_t width, - Major src_order, - PackOrder pack_order) { - SGemm sg; - SGemmMatrixMap - src_matrix(1, height, width, src_order, data.data()); - PackedBlock packed; - packed.Resize({height, width}); - if (pack_order == PackOrder::SGemmColMajor) { - sg.PackLhs(src_matrix, &packed); - } else { - sg.PackRhs(src_matrix, &packed); - } - - auto packed_data = packed.data(); - for (index_t i = 0; i < packed.size(); ++i) { - EXPECT_EQ(expected_data[i], packed_data[i]); - } -} - -void TestUnPack(const index_t height, - const index_t width, - Major src_order, - PackOrder pack_order) { - static auto seed = static_cast(time(nullptr)); - const index_t matrix_size = height * width; - std::vector data(matrix_size); - for (int i = 0; i < matrix_size; ++i) { - data[i] = rand_r(&seed); - } - - SGemmMatrixMap - src_matrix(1, height, width, src_order, data.data()); - PackedBlock packed; - packed.Resize({height, width}); - SGemm sg; - if (pack_order == PackOrder::SGemmColMajor) { - sg.PackLhs(src_matrix, &packed); - } else { - sg.PackRhs(src_matrix, &packed); - } - - std::vector unpacked(matrix_size); - SGemmMatrixMap - unpacked_matrix(1, height, width, src_order, unpacked.data()); - sg.UnPack(packed, &unpacked_matrix); - auto unpacked_data = unpacked.data(); - for (index_t i = 0; i < packed.size(); ++i) { - EXPECT_EQ(data[i], unpacked_data[i]); - } -} -} // namespace - - -TEST(SGemmPackTest, Pack) { - std::vector data = - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36}; - - // For no-transpose lhs - TestPack(data, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, - 3, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor); -#if defined(MACE_ENABLE_NEON) - TestPack(data, - {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16}, - 4, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor); - TestPack(data, - {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19, - 20}, - 5, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor); -#if defined(__aarch64__) - TestPack(data, - {1, 5, 9, 13, 17, 21, 25, 29, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, - 15, 19, 23, 27, 31, 4, 8, 12, 16, 20, 24, 28, 32, 33, 34, 35, 36}, - 9, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor); -#endif -#endif - // For transpose-needed lhs - TestPack(data, - {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12}, - 3, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor); -#if defined(MACE_ENABLE_NEON) - TestPack(data, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, - 4, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor); - TestPack(data, - {1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15, - 20}, - 5, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor); -#if defined(__aarch64__) - TestPack(data, - {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, - 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 9, 18, 27, 36}, - 9, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor); -#endif -#endif - // For no-transpose rhs - TestPack(data, - {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12}, - 4, 3, Major::SGemmRowMajor, PackOrder::SGemmRowMajor); -#if defined(MACE_ENABLE_NEON) - TestPack(data, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, - 4, 4, Major::SGemmRowMajor, 
           PackOrder::SGemmRowMajor);
-  TestPack(data,
-           {1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15,
-            20},
-           4, 5, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-#endif
-  // For transpose-needed rhs
-  TestPack(data,
-           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
-           4, 3, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-#if defined(MACE_ENABLE_NEON)
-  TestPack(data,
-           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16},
-           4, 4, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestPack(data,
-           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19,
-            20},
-           4, 5, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-#endif
-}
-
-TEST(SGemmPackTest, UnPack) {
-  TestUnPack(4, 3, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 4, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 5, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 100, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 3, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 4, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 5, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 100, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-}
-
-}  // namespace test
-}  // namespace ops
-}  // namespace mace
-
diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc
index 79d05bdcaf27bd9dc8cc49f14254d5c1316beaa2..dcca202f3229f616a3ce89dddcd008cf998a1a69 100644
--- a/mace/ops/shape.cc
+++ b/mace/ops/shape.cc
@@ -35,11 +35,10 @@ class ShapeOp : public Operation {
     Tensor::MappingGuard output_guard(output);
     int32_t *output_data = output->mutable_data<int32_t>();
 
-    const int data_format =
-        Operation::GetOptionalArg<int>("data_format", 0);
-    if (input->dim_size() == 4 &&
-        D == DeviceType::CPU &&
-        data_format == DataFormat::NCHW) {
+    auto has_df = Operation::GetOptionalArg<int>(
+        "has_data_format", 0);
+    if (has_df && input->data_format() == DataFormat::NCHW &&
+        input->dim_size() == 4) {
       // transpose NCHW to NHWC for cpu runtime
       output_data[0] = static_cast<int32_t>(input->dim(0));
       output_data[1] = static_cast<int32_t>(input->dim(2));
diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f38a2a32a861a2ca20882268bc98d96fca55d6d7
--- /dev/null
+++ b/mace/ops/slice.cc
@@ -0,0 +1,94 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <functional>
+#include <numeric>
+
+#include "mace/core/operator.h"
+
+namespace mace {
+namespace ops {
+
+template <DeviceType D, typename T>
+class SliceOp;
+
+template <typename T>
+class SliceOp<DeviceType::CPU, T> : public Operation {
+ public:
+  explicit SliceOp(OpConstructContext *context)
+      : Operation(context),
+        axes_(Operation::GetRepeatedArgs<int>("axes")),
+        starts_(Operation::GetRepeatedArgs<int>("starts")),
+        ends_(Operation::GetRepeatedArgs<int>("ends")) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    const Tensor *input = this->Input(0);
+    Tensor *output = this->Output(0);
+
+    const index_t rank = input->dim_size();
+    MACE_CHECK(rank >= 1)
+        << "The input dim size should be >= 1";
+    MACE_CHECK(starts_.size() == 1 && ends_.size() == 1 && axes_.size() == 1,
+               "only support slicing at one axis.");
+    MACE_CHECK(axes_[0] == -1 || axes_[0] == rank - 1,
+               "only support slicing at the last axis.");
+    const index_t input_dim = input->dim(rank - 1);
+    const index_t offset = starts_[0];
+    const index_t output_dim = ends_[0] - starts_[0];
+
+    MACE_CHECK(output_dim >= 0, "output_dim should be >= 0");
+    MACE_CHECK(starts_[0] < input_dim
+                   && output_dim <= input_dim
+                   && ends_[0] <= input_dim)
+        << "The starts and ends caused an out-of-range error.";
+
+    const index_t frames =
+        std::accumulate(input->shape().begin(), input->shape().end() - 1, 1,
+                        std::multiplies<index_t>());
+
+    std::vector<index_t> output_shape = input->shape();
+    output_shape[rank - 1] = output_dim;
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+    Tensor::MappingGuard input_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+
+#pragma omp parallel for schedule(runtime)
+    for (index_t i = 0; i < frames; ++i) {
+      const T *input_base =
+          input_data + i * input_dim + offset;
+      T *output_base =
+          output_data + i * output_dim;
+      memcpy(output_base, input_base, output_dim * sizeof(T));
+    }
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  std::vector<int> axes_;
+  std::vector<int> starts_;
+  std::vector<int> ends_;
+};
+
+void RegisterSlice(OpRegistryBase *op_registry) {
+  MACE_REGISTER_OP(op_registry, "Slice", SliceOp,
+                   DeviceType::CPU, float);
+}
+
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/slice_test.cc b/mace/ops/slice_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5f82cc18d52120715b57f4388e4ce77dbb1a7d7
--- /dev/null
+++ b/mace/ops/slice_test.cc
@@ -0,0 +1,71 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class SliceOpTest : public OpsTestBase {}; + +namespace { +template +void TestSlice(const std::vector &input_shape, + const std::vector &input, + const int offset, + const int output_dim, + const std::vector &output_shape, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray(MakeString("Input"), + input_shape, + input); + + OpDefBuilder("Slice", "SliceTest") + .Input("Input") + .Output("Output") + .AddIntsArg("axes", {-1}) + .AddIntsArg("starts", {offset}) + .AddIntsArg("ends", {offset + output_dim}) + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.AddInputFromArray("ExpectedOutput", output_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(SliceOpTest, Simple2Dim) { + TestSlice( + {3, 5}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + 2, 3, {3, 3}, + {3, 4, 5, 8, 9, 10, 13, 14, 15}); +} + +TEST_F(SliceOpTest, Simple3Dim) { + TestSlice( + {2, 3, 5}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + 1, 2, {2, 3, 2}, + {2, 3, 7, 8, 12, 13, 2, 3, 7, 8, 12, 13}); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 54f3e55bbaf07d04026ed28de0ed361bd9ff2061..cbab37adf5ebe9e0a3195483cecc287be5931bd0 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -22,7 +22,7 @@ #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/fixpoint.h" -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL @@ -30,6 +30,8 @@ #include "mace/ops/opencl/buffer/softmax.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" + namespace mace { namespace ops { @@ -132,10 +134,10 @@ class SoftmaxOp : public Operation { } }; +#ifdef MACE_ENABLE_QUANTIZE static const int kInputDeltaIntBits = 6; static const int kSumExpIntBits = 12; -#ifdef MACE_ENABLE_QUANTIZE template <> class SoftmaxOp : public Operation { public: @@ -374,10 +376,10 @@ class SoftmaxOp : public Operation { explicit SoftmaxOp(OpConstructContext *context) : Operation(context) { if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SoftmaxKernel); + kernel_ = make_unique>(); } else { context->set_output_mem_type(MemoryType::GPU_BUFFER); - kernel_.reset(new opencl::buffer::SoftmaxKernel); + kernel_ = make_unique>(); } } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index e1da96664abe010a84bd287cc9b2cd940ed7e736..ece9b6f61dd25e0fe4c6d2f5aff1aeea4ed55302 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -19,6 +19,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/space_to_batch.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -309,7 +310,7 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { explicit SpaceToBatchNDOp(OpConstructContext *context) : SpaceToBatchOpBase(context) { if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SpaceToBatchKernel); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index fb98de71dd118448d02c64f06fb1a79f9d3a8302..4e40227c5b5857d065195d509bcafe55fbef1c59 100644 --- 
a/mace/ops/space_to_depth.cc
+++ b/mace/ops/space_to_depth.cc
@@ -19,6 +19,7 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/space_to_depth.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
@@ -95,7 +96,7 @@ class SpaceToDepthOp : public Operation {
     : Operation(context) {
     int block_size = Operation::GetOptionalArg<int>("block_size", 1);
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::SpaceToDepthKernel<T>(block_size));
+      kernel_ = make_unique<opencl::image::SpaceToDepthKernel<T>>(block_size);
     } else {
       MACE_NOT_IMPLEMENTED;
     }
diff --git a/mace/ops/splice.cc b/mace/ops/splice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bf1dfe36b41e8c79675f3d75b0578fa2ce76816e
--- /dev/null
+++ b/mace/ops/splice.cc
@@ -0,0 +1,121 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This Op is for SpliceComponent in Kaldi.
+// It splices a context window of frames together [over time]
+// (it copies and appends each frame whose time-index is in context_).
+// The context_ values indicate which frames (over time) to splice:
+// if a context value points before the first time-index,
+// the first frame's data is copied and appended;
+// if it points past the last frame,
+// the last frame's data is copied and appended.
+// E.g., given input data [[1, 2, 3], [4, 5, 6]]
+// with input-dim = 3, frame count = 2 and context = [-1, 0, 1],
+// the output should be:
+// [1, 2, 3, 1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6, 4, 5, 6]
+// If const_component_dim_ != 0, const_dim_ will be used to determine which
+// row of "in" we copy the last part of each row of "out" from (this part is
+// not subject to splicing; it's assumed constant for each frame of "input").
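A compact way to state the time-index rule described in the comment above: for output frame i and context value c, the source frame is clamp(i + c, 0, frames - 1). A hypothetical helper plus the worked numbers from the comment's own example:

    #include <algorithm>
    int SpliceSourceFrame(int i, int c, int frames) {
      return std::max(0, std::min(i + c, frames - 1));
    }
    // frames = 2, context = {-1, 0, 1}: output frame 0 reads source frames
    // {0, 0, 1} and output frame 1 reads {0, 1, 1}, reproducing the two
    // output rows shown in the comment above.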
+ +#include +#include + +#include "mace/core/operator.h" +#include "mace/utils/math.h" + +namespace mace { +namespace ops { + +template +class SpliceOp; + +template +class SpliceOp : public Operation { + public: + explicit SpliceOp(OpConstructContext *context) + : Operation(context), + context_(Operation::GetRepeatedArgs("context")), + const_dim_( + Operation::GetOptionalArg("const_component_dim", 0)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + MACE_CHECK(context_.size() > 0) + << "The context param should not be empty in Splice Op."; + + Tensor *output = this->Output(0); + const std::vector &input_shape = input->shape(); + + const index_t frames = + std::accumulate(input->shape().begin(), input->shape().end() - 1, 1, + std::multiplies()); + + const index_t rank = input->dim_size(); + const index_t input_dim = input_shape[rank - 1]; + + const index_t num_splice = static_cast(context_.size()); + const index_t dim = input_dim - const_dim_; + MACE_CHECK(input_dim > const_dim_, + "input dim should be greater than const dim."); + const index_t output_dim = dim * num_splice + const_dim_; + + std::vector output_shape = input->shape(); + output_shape[rank - 1] = output_dim; + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t i = 0; i < frames; ++i) { + for (index_t c = 0; c < num_splice; ++c) { + const index_t offset = + Clamp(context_[c] + i, 0, frames - 1); + T *output_base = output_data + i * output_dim + c * dim; + const T *input_base = input_data + offset * input_dim; + memcpy(output_base, input_base, dim * sizeof(T)); + } + } + + if (const_dim_ > 0) { + const index_t output_offset = output_dim - const_dim_; + const index_t input_offset = dim; +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i < frames; ++i) { + index_t offset = i + context_[0] >= 0 ? i + context_[0] : 0; + T *output_base = output_data + i * output_dim; + const T *input_base = input_data + offset * input_dim; + memcpy(output_base + output_offset, + input_base + input_offset, + const_dim_ * sizeof(T)); + } + } + return MaceStatus::MACE_SUCCESS; + } + + private: + std::vector context_; + int const_dim_; +}; + +void RegisterSplice(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Splice", SpliceOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/splice_benchmark.cc b/mace/ops/splice_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..253808b8385e1526432cfdc3cd5befd98f70736b --- /dev/null +++ b/mace/ops/splice_benchmark.cc @@ -0,0 +1,92 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
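A worked shape calculation for the const-dim branch of SpliceOp above (numbers taken from the WithConstDim test further below): with input_dim = 10 and const_component_dim = 7, the spliced part uses dim = 10 - 7 = 3 columns per frame; with context = {-2, -1, 0, 1, 2} (num_splice = 5) the output dim is 3 * 5 + 7 = 22, matching that test's {1, 5, 22} output shape. The trailing 7 columns of output row i are copied once from input row max(i + context[0], 0).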
+ +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +namespace { +template +void BMSpliceHelper(int iters, + const std::vector &input_shape, + const index_t left_context, + const index_t right_context, + const int const_component_dim) { + mace::testing::StopTiming(); + + // Construct graph + OpsTestNet net; + + const int num_splice = left_context + right_context + 1; + std::vector contexts(num_splice); + for (int i = 0; i < num_splice; ++i) { + contexts[i] = left_context + i; + } + const index_t input_size = std::accumulate(input_shape.begin(), + input_shape.end(), + 1, + std::multiplies()); + std::vector input_data(input_size); + GenerateRandomRealTypeData(input_shape, &input_data); + net.AddInputFromArray("Input", input_shape, input_data); + + OpDefBuilder("Splice", "SpliceTest") + .Input("Input") + .Output("Output") + .AddIntsArg("context", contexts) + .AddIntArg("const_component_dim", const_component_dim) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + net.Sync(); + } + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + net.Sync(); + } +} +} // namespace + +#define MACE_BM_SPLICE_MACRO(N, H, W, L, R, C, TYPE, DEVICE) \ + static void \ + MACE_BM_SPLICE_##N##_##H##_##W##_##L##_##R##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W; \ + mace::testing::MacsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMSpliceHelper(iters, {N, H, W}, L, R, C); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_SPLICE_##N##_##H##_##W##_##L##_##R##_##C##_##TYPE##_##DEVICE) + +#define MACE_BM_SPLICE(N, H, W, L, R, C) \ + MACE_BM_SPLICE_MACRO(N, H, W, L, R, C, float, CPU); + +MACE_BM_SPLICE(1, 32, 32, 5, 5, 10); +MACE_BM_SPLICE(1, 32, 32, 7, 7, 5); +MACE_BM_SPLICE(1, 32, 32, 3, 3, 20); +MACE_BM_SPLICE(1, 128, 128, 9, 9, 100); +MACE_BM_SPLICE(1, 128, 128, 7, 7, 100); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/splice_test.cc b/mace/ops/splice_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..60e1652a394d7d1a7b88c0b1f537ec5fc688d613 --- /dev/null +++ b/mace/ops/splice_test.cc @@ -0,0 +1,84 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class SpliceOpTest : public OpsTestBase {}; + +namespace { +template +void TestSplice(const std::vector &input_shape, + const std::vector &input, + const std::vector &context, + const int const_dim, + const std::vector &output_shape, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray(MakeString("Input"), + input_shape, + input); + + OpDefBuilder("Splice", "SpliceTest") + .Input("Input") + .Output("Output") + .AddIntsArg("context", context) + .AddIntArg("const_component_dim", const_dim) + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.AddInputFromArray("ExpectedOutput", output_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(SpliceOpTest, WithoutConstDim) { + TestSplice( + {1, 7, 2}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {-2, -1, 0, 1, 2}, 0, + {1, 7, 10}, + {1, 2, 1, 2, 1, 2, 3, 4, 5, 6, + 1, 2, 1, 2, 3, 4, 5, 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 7, 8, 9, 10, 11, 12, 13, 14, 13, 14, + 9, 10, 11, 12, 13, 14, 13, 14, 13, 14}); +} + +TEST_F(SpliceOpTest, WithConstDim) { + TestSplice( + {1, 5, 10}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {-2, -1, 0, 1, 2}, 7, + {1, 5, 22}, + {1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, + 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 5, 6, 7, 5, 6, 7, 8, 9, 10, 11, + 3, 4, 5, 4, 5, 6, 5, 6, 7, 5, 6, 7, 5, 6, 7, 6, 7, 8, 9, 10, 11, 12}); +} +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/split.cc b/mace/ops/split.cc index 7fe05be1edf474cc92ee8c049f27e8a265ca7219..7c920d4c115f9650973ab62a2c79d29b677faf83 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -19,6 +19,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/split.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -35,9 +36,9 @@ class SplitOp : public Operation { checked_(false) {} void Validate() { - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && this->Input(0)->dim_size() == 4) { if (axis_ == 3) axis_ = 1; else if (axis_ == 2) axis_ = 3; else if (axis_ == 1) axis_ = 2; @@ -108,7 +109,7 @@ class SplitOp : public Operation { : Operation(context) { int32_t axis = Operation::GetOptionalArg("axis", 3); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SplitKernel(axis)); + kernel_ = make_unique>(axis); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc index 45331685059228b32ef92f7abffbc98791d90d0b..17584778a8ae93994530bdbad9f8a53d476b1e18 100644 --- a/mace/ops/split_benchmark.cc +++ b/mace/ops/split_benchmark.cc @@ -44,6 +44,7 @@ void BMSplitHelper(int iters, } builder .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Warm-up diff --git a/mace/ops/split_test.cc 
b/mace/ops/split_test.cc index 726d12e6fae54054d504b1a5a07fb9aa70a4e8e5..b693fd0cd3da81e00c5627a852ef6e1c7b97b4c7 100644 --- a/mace/ops/split_test.cc +++ b/mace/ops/split_test.cc @@ -54,7 +54,7 @@ void RandomTest(const int num_outputs, int axis) { builder = builder.Output(MakeString("Output", i)); } builder.AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index 1bd8a2e33e872715f57b712102643b411b142fbb..b937b259322615abcbb929e4c17c0f41e3844167 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -19,6 +19,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/sqrdiff_mean.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -83,7 +84,7 @@ class SqrDiffMeanOp : public Operation { explicit SqrDiffMeanOp(OpConstructContext *context) : Operation(context) { if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SqrDiffMeanKernel()); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index e67d2672b4df63795cb63bbce9b0e4960d33fa43..15c3408c2bbbfbc6832af699045036d1580152c7 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -32,9 +32,9 @@ class SqueezeOp : public Operation { MACE_UNUSED(context); if (!checked_ && D == DeviceType::CPU && DataTypeToEnum::value != DT_UINT8) { - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && this->Input(0)->dim_size() == 4) { if (axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2) { axis_[0] = 2; axis_[1] = 3; diff --git a/mace/ops/squeeze_test.cc b/mace/ops/squeeze_test.cc index 3c27f6b9c0ca127726c04599012698a8d4a5d236..8cd829794c16c71b3df1853fedc79eed75d317a8 100644 --- a/mace/ops/squeeze_test.cc +++ b/mace/ops/squeeze_test.cc @@ -30,7 +30,7 @@ void TestSqueeze(const std::vector &org_shape, OpDefBuilder("Squeeze", "SqueezeTest") .Input("Input") .AddIntsArg("axis", axis) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/strided_slice.cc b/mace/ops/strided_slice.cc index 221a75d46442afd1b3f385350b6ddd943bdb5db9..c10914f27fb87e7e1159749eb990a66bb6506f42 100644 --- a/mace/ops/strided_slice.cc +++ b/mace/ops/strided_slice.cc @@ -17,6 +17,7 @@ #include #include "mace/core/operator.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -32,21 +33,69 @@ class StridedSliceOp : public Operation { new_axis_mask_(Operation::GetOptionalArg("new_axis_mask", 0)), shrink_axis_mask_( Operation::GetOptionalArg("shrink_axis_mask", 0)), - is_slice_(Operation::GetOptionalArg("slice", false)) { + is_slice_(Operation::GetOptionalArg("slice", false)), + has_data_format_(Operation::GetOptionalArg("has_data_format", 0)), + checked_(false) { MACE_CHECK(ellipsis_mask_ == 0 && new_axis_mask_ == 0, "ellipsis_mask and new_axis_mask are not supported yet."); } + void TransposeMaskValueFromNHWCToNCHW(int* mask_value) { + size_t dims[4]; + int count; + for (count = 0; count < 4; ++count) { + dims[count] = *mask_value & 1; + *mask_value >>= 1; + } + size_t new_dims[4] = {dims[0], dims[3], dims[1], dims[2]}; + for (count = 3; 
count >= 0; --count) { + *mask_value <<= 1; + *mask_value += new_dims[count]; + } + } + + void TransposeDimsFromNHWCToNCHW(std::vector* dims) { + int32_t h = (*dims)[1]; + int32_t w = (*dims)[2]; + int32_t c = (*dims)[3]; + + (*dims)[1] = c; + (*dims)[2] = h; + (*dims)[3] = w; + } + + void TransposeDimsFromNCHWToNHWC(std::vector* dims) { + int32_t c = (*dims)[1]; + int32_t h = (*dims)[2]; + int32_t w = (*dims)[3]; + + (*dims)[1] = h; + (*dims)[2] = w; + (*dims)[3] = c; + } + MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); + + if (!checked_) { + if (has_data_format_ && this->Input(0)->dim_size() == 4) { + TransposeMaskValueFromNHWCToNCHW(&begin_mask_); + TransposeMaskValueFromNHWCToNCHW(&end_mask_); + TransposeMaskValueFromNHWCToNCHW(&ellipsis_mask_); + TransposeMaskValueFromNHWCToNCHW(&new_axis_mask_); + TransposeMaskValueFromNHWCToNCHW(&shrink_axis_mask_); + } + checked_ = true; + } + const Tensor *input = this->Input(INPUT); const Tensor *begin_indices = this->Input(BEGIN); const Tensor *end_indices = this->Input(END); const Tensor *strides = nullptr; + if (this->InputSize() > 3) { strides = this->Input(STRIDES); } - Tensor *output = this->Output(OUTPUT); if (strides == nullptr) { tmp_strides_tensor_.Resize({begin_indices->size()}); Tensor::MappingGuard strides_guard(&tmp_strides_tensor_); @@ -55,6 +104,11 @@ class StridedSliceOp : public Operation { strides = &tmp_strides_tensor_; } + MACE_CHECK(begin_indices->dim_size() == 1 && + end_indices->dim_size() == 1 && + strides->dim_size() == 1, + "Expected begin, end, and strides to be 1D tensor"); + Tensor::MappingGuard input_guard(input); Tensor::MappingGuard begin_indices_guard(begin_indices); Tensor::MappingGuard end_indices_guard(end_indices); @@ -63,88 +117,145 @@ class StridedSliceOp : public Operation { const int32_t *begin_indices_data = begin_indices->data(); const int32_t *end_indices_data = end_indices->data(); const int32_t *strides_data = strides->data(); - std::vector pad_begin_indices(input->dim_size(), 0); - std::vector pad_end_indices(input->dim_size(), 0); - std::vector pad_strides_indices(input->dim_size(), 1); - - if (begin_indices->size() < input->dim_size()) { - for (index_t i = 0; i < begin_indices->size(); ++i) { - pad_begin_indices[i] = begin_indices_data[i]; - pad_end_indices[i] = end_indices_data[i]; - pad_strides_indices[i] = strides_data[i]; - } - for (index_t i = begin_indices->size(); i < input->dim_size(); ++i) { - pad_end_indices[i] = input->dim(i); - } - begin_indices_data = pad_begin_indices.data(); - end_indices_data = pad_end_indices.data(); - strides_data = pad_strides_indices.data(); - } - std::vector slice_end_data; + std::vector begin_indices_vec( + begin_indices_data, begin_indices_data + begin_indices->size()); + std::vector end_indices_vec( + end_indices_data, end_indices_data + end_indices->size()); + std::vector strides_indices_vec( + strides_data, strides_data + strides->size()); + + MACE_CHECK(input->size() > 0 && input->dim_size() > 0 && + input->dim_size() <= 4, + "The input size should larger than 0." 
+ " And input dims should be an integer in (0, 4]."); + + std::vector output_shape = {}; + + const size_t input_dims = input->dim_size(); if (is_slice_) { - // if this op is slice, the end_indices_data is size actually - slice_end_data.resize(end_indices->size()); - for (size_t i = 0; i < slice_end_data.size(); ++i) { - if (end_indices_data[i] == -1) { - slice_end_data[i] = input->dim(i); - } else { - slice_end_data[i] = begin_indices_data[i] + end_indices_data[i]; + MACE_CHECK(begin_indices_vec.size() == input_dims && + end_indices_vec.size() == input_dims, + "In slice, begin and size elements num should be equal"); + + // transpose + if (has_data_format_ && this->Input(0)->dim_size() == 4) { + TransposeDimsFromNHWCToNCHW(&begin_indices_vec); + TransposeDimsFromNHWCToNCHW(&end_indices_vec); + TransposeDimsFromNHWCToNCHW(&strides_indices_vec); + } + + for (size_t i = 0; i < input_dims; ++i) { + if (end_indices_vec[i] == -1) { + end_indices_vec[i] = input->dim(i) - begin_indices_vec[i]; } } - end_indices_data = slice_end_data.data(); - } - std::vector output_shape; - std::vector real_begin_indices(input->dim_size(), 0); - std::vector real_end_indices(input->dim_size(), 0); - for (index_t d = 0; d < input->dim_size(); ++d) { - index_t dim_len = input->dim(d); - if (begin_mask_ & (1 << d)) { - real_begin_indices[d] = strides_data[d] > 0 ? 0 : dim_len - 1; - } else { - real_begin_indices[d] = (begin_indices_data[d] + dim_len) % dim_len; + for (size_t i = 0; i < input_dims; ++i) { + int32_t b = begin_indices_vec[i]; + int32_t s = end_indices_vec[i]; + int32_t input_i = input->dim(i); + MACE_CHECK(0 <= b && b <= input_i, + "In Slice, expected begin[", i, "] in [0, ", input_i, + "], but got ", b); + MACE_CHECK(0 <= s && b + s <= input_i, + "In Slice, expected size[", i, "] in [0, ", + input_i - b, "], but got", s); + end_indices_vec[i] = b + s; + output_shape.push_back(s); } - if (end_mask_ & (1 << d)) { - real_end_indices[d] = strides_data[d] > 0 ? dim_len : -1; - } else { - real_end_indices[d] = - end_indices_data[d] < -dim_len - ? -1 - : (end_indices_data[d] < 0 - ? 
(end_indices_data[d] + dim_len) - : std::min(static_cast(end_indices_data[d]), - dim_len)); + } else { + MACE_CHECK(begin_indices_vec.size() == end_indices_vec.size() && + end_indices_vec.size() == strides_indices_vec.size(), + "In strided_slice, expected begin, end, and strides to be", + " equal size tensors"); + for (index_t i = 0; i < strides->size(); ++i) { + MACE_CHECK(strides_indices_vec[i] != 0, "strides data cannot be 0!"); } - int32_t out_dim_len = std::max( - 0.f, std::ceil((real_end_indices[d] - real_begin_indices[d]) / - static_cast(strides_data[d]))); - if (!(shrink_axis_mask_ & (1 << d))) { - output_shape.push_back(out_dim_len); - } else { - MACE_CHECK(out_dim_len == 1, - "cannot shrink axis that has len > 1, dim(", d, "): [", - real_begin_indices[d], ", ", real_end_indices[d], "]"); + // pad + begin_indices_vec.resize(input_dims, 0); + strides_indices_vec.resize(input_dims, 1); + std::vector tmp_input_dims(input->shape().begin(), + input->shape().end()); + if (has_data_format_ && input_dims == 4) { + TransposeDimsFromNCHWToNHWC(&tmp_input_dims); + } + for (size_t i = end_indices_vec.size(); i < input_dims; ++i) { + end_indices_vec.push_back(tmp_input_dims[i]); + } + + // transpose + if (has_data_format_ && this->Input(0)->dim_size() == 4) { + TransposeDimsFromNHWCToNCHW(&begin_indices_vec); + TransposeDimsFromNHWCToNCHW(&end_indices_vec); + TransposeDimsFromNHWCToNCHW(&strides_indices_vec); + } + + // mask and shrink + for (index_t d = 0; d < input->dim_size(); ++d) { + index_t dim_len = input->dim(d); + const std::vector valid_range = { + strides_indices_vec[d] > 0 ? 0 : -1, + strides_indices_vec[d] > 0 ? dim_len : dim_len - 1}; + + auto format_indices = [valid_range, d, dim_len](index_t indice) { + index_t forward = indice < 0 ? indice + dim_len : indice; + return Clamp(forward, valid_range[0], valid_range[1]); + }; + + if (!(shrink_axis_mask_ & (1 << d))) { + if (begin_mask_ & (1 << d)) { + begin_indices_vec[d] = strides_indices_vec[d] > 0 ? 0 : dim_len - 1; + } else { + begin_indices_vec[d] = format_indices(begin_indices_vec[d]); + } + if (end_mask_ & (1 << d)) { + end_indices_vec[d] = strides_indices_vec[d] > 0 ? dim_len : -1; + } else { + end_indices_vec[d] = format_indices(end_indices_vec[d]); + } + + int32_t out_dim_len = std::max( + 0.f, std::ceil((end_indices_vec[d] - begin_indices_vec[d]) / + static_cast(strides_indices_vec[d]))); + output_shape.push_back(out_dim_len); + } else { + begin_indices_vec[d] = begin_indices_vec[d] < 0 + ? 
begin_indices_vec[d] + dim_len + : begin_indices_vec[d]; + end_indices_vec[d] = begin_indices_vec[d] + 1; + MACE_CHECK( + begin_indices_vec[d] >= 0 && begin_indices_vec[d] < dim_len, + "slice begin indice of dimension '", d, "': ", + begin_indices_vec[d], ", is out of bound"); + } } } + for (size_t i = 0; i < output_shape.size(); ++i) { + MACE_CHECK(output_shape[i] > 0, + "Expected output_shape[", i, "] larger than 0, but got ", + output_shape[i]); + } + std::vector dim_stride(input->dim_size(), 1); for (index_t d = input->dim_size() - 2; d >= 0; --d) { dim_stride[d] = dim_stride[d + 1] * input->dim(d + 1); } + Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->Resize(output_shape)); Tensor::MappingGuard output_guard(output); T *output_data = output->mutable_data(); bool slice_by_first_axis = true; - if (strides_data[0] != 1) { + if (strides_indices_vec[0] != 1) { slice_by_first_axis = false; } else { for (index_t d = 1; d < input->dim_size(); ++d) { - if (strides_data[d] != 1 || real_begin_indices[d] != 0 || - real_end_indices[d] != input->dim(d)) { + if (strides_indices_vec[d] != 1 || begin_indices_vec[d] != 0 || + end_indices_vec[d] != input->dim(d)) { slice_by_first_axis = false; break; } @@ -152,47 +263,71 @@ class StridedSliceOp : public Operation { } if (slice_by_first_axis) { - memcpy(output_data, input_data + real_begin_indices[0] * dim_stride[0], - sizeof(T) * (real_end_indices[0] - real_begin_indices[0]) * + memcpy(output_data, input_data + begin_indices_vec[0] * dim_stride[0], + sizeof(T) * (end_indices_vec[0] - begin_indices_vec[0]) * dim_stride[0]); } else { if (input->dim_size() == 1) { - for (index_t i = real_begin_indices[0]; - strides_data[0] > 0 ? i < real_end_indices[0] - : i > real_end_indices[0]; - i += strides_data[0]) { + for (index_t i = begin_indices_vec[0]; + strides_indices_vec[0] > 0 ? i < end_indices_vec[0] + : i > end_indices_vec[0]; + i += strides_indices_vec[0]) { *output_data++ = input_data[i]; } } else if (input->dim_size() == 2) { - for (index_t i = real_begin_indices[0]; - strides_data[0] > 0 ? i < real_end_indices[0] - : i > real_end_indices[0]; - i += strides_data[0]) { - for (index_t j = real_begin_indices[1]; - strides_data[1] > 0 ? j < real_end_indices[1] - : j > real_end_indices[1]; - j += strides_data[1]) { + for (index_t i = begin_indices_vec[0]; + strides_indices_vec[0] > 0 ? i < end_indices_vec[0] + : i > end_indices_vec[0]; + i += strides_indices_vec[0]) { + for (index_t j = begin_indices_vec[1]; + strides_indices_vec[1] > 0 ? j < end_indices_vec[1] + : j > end_indices_vec[1]; + j += strides_indices_vec[1]) { *output_data++ = input_data[i * input->dim(1) + j]; } } } else if (input->dim_size() == 3) { - for (index_t i = real_begin_indices[0]; - strides_data[0] > 0 ? i < real_end_indices[0] - : i > real_end_indices[0]; - i += strides_data[0]) { - for (index_t j = real_begin_indices[1]; - strides_data[1] > 0 ? j < real_end_indices[1] - : j > real_end_indices[1]; - j += strides_data[1]) { - for (index_t k = real_begin_indices[2]; - strides_data[2] > 0 ? k < real_end_indices[2] - : k > real_end_indices[2]; - k += strides_data[2]) { + for (index_t i = begin_indices_vec[0]; + strides_indices_vec[0] > 0 ? i < end_indices_vec[0] + : i > end_indices_vec[0]; + i += strides_indices_vec[0]) { + for (index_t j = begin_indices_vec[1]; + strides_indices_vec[1] > 0 ? j < end_indices_vec[1] + : j > end_indices_vec[1]; + j += strides_indices_vec[1]) { + for (index_t k = begin_indices_vec[2]; + strides_indices_vec[2] > 0 ? 
k < end_indices_vec[2] + : k > end_indices_vec[2]; + k += strides_indices_vec[2]) { *output_data++ = input_data[(i * input->dim(1) + j) * input->dim(2) + k]; } } } + } else if (input->dim_size() == 4) { + for (index_t i = begin_indices_vec[0]; + strides_indices_vec[0] > 0 ? i < end_indices_vec[0] + : i > end_indices_vec[0]; + i += strides_indices_vec[0]) { + for (index_t j = begin_indices_vec[1]; + strides_indices_vec[1] > 0 ? j < end_indices_vec[1] + : j > end_indices_vec[1]; + j += strides_indices_vec[1]) { + for (index_t k = begin_indices_vec[2]; + strides_indices_vec[2] > 0 ? k < end_indices_vec[2] + : k > end_indices_vec[2]; + k += strides_indices_vec[2]) { + for (index_t l = begin_indices_vec[3]; + strides_indices_vec[3] > 0 ? l < end_indices_vec[3] + : l > end_indices_vec[3]; + l += strides_indices_vec[3]) { + *output_data++ = + input_data[((i * input->dim(1) + j) * input->dim(2) + k) + * input->dim(3) + l]; + } + } + } + } } else { MACE_NOT_IMPLEMENTED; } @@ -207,6 +342,8 @@ class StridedSliceOp : public Operation { int new_axis_mask_; int shrink_axis_mask_; bool is_slice_; + int has_data_format_; + bool checked_; Tensor tmp_strides_tensor_; MACE_OP_INPUT_TAGS(INPUT, BEGIN, END, STRIDES); diff --git a/mace/ops/strided_slice_test.cc b/mace/ops/strided_slice_test.cc index df691ce682f2a0a55db2f93a9077a265f61cbef0..8b085fe532694f7c343e0cfda735d91332aea294 100644 --- a/mace/ops/strided_slice_test.cc +++ b/mace/ops/strided_slice_test.cc @@ -64,6 +64,54 @@ void TestStridedSlice(const std::vector &input_shape, *net.GetOutput("Output")); } +void TestStridedSliceWithDataFormat(const std::vector &input_shape, + const std::vector &input, + const std::vector &begin_indices, + const std::vector &end_indices, + const std::vector &strides, + const int begin_mask, + const int end_mask, + const int ellipsis_mask, + const int new_axis_mask, + const int shrink_axis_mask, + const std::vector &output_shape, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray("Input", input_shape, input); + net.AddInputFromArray( + "BeginIndices", {static_cast(begin_indices.size())}, + begin_indices); + net.AddInputFromArray( + "EndIndices", {static_cast(end_indices.size())}, end_indices); + net.AddInputFromArray( + "Strides", {static_cast(strides.size())}, strides); + + net.TransformDataFormat("Input", NHWC, "InputNCHW", + NCHW); + + OpDefBuilder("StridedSlice", "StridedSliceOpTest") + .Input("InputNCHW") + .Input("BeginIndices") + .Input("EndIndices") + .Input("Strides") + .Output("OutputNCHW") + .AddIntArg("begin_mask", begin_mask) + .AddIntArg("end_mask", end_mask) + .AddIntArg("ellipsis_mask", ellipsis_mask) + .AddIntArg("new_axis_mask", new_axis_mask) + .AddIntArg("shrink_axis_mask", shrink_axis_mask) + .AddIntArg("has_data_format", 1) + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.TransformDataFormat("OutputNCHW", NCHW, "Output", + NHWC); + net.AddInputFromArray("ExpectedOutput", output_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} + void TestSlice(const std::vector &input_shape, const std::vector &input, const std::vector &begin_indices, @@ -92,6 +140,41 @@ void TestSlice(const std::vector &input_shape, *net.GetOutput("Output")); } +void TestSliceWithDataFormat(const std::vector &input_shape, + const std::vector &input, + const std::vector &begin_indices, + const std::vector &indices_size, + const std::vector &output_shape, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray("Input", input_shape, input); 
+ net.AddInputFromArray( + "BeginIndices", {static_cast(input_shape.size())}, + begin_indices); + net.AddInputFromArray( + "IndicesSize", {static_cast(indices_size.size())}, indices_size); + + net.TransformDataFormat("Input", NHWC, "InputNCHW", + NCHW); + + OpDefBuilder("StridedSlice", "StridedSliceOpTest") + .Input("InputNCHW") + .Input("BeginIndices") + .Input("IndicesSize") + .Output("OutputNCHW") + .AddIntArg("slice", 1) + .AddIntArg("has_data_format", 1) + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.TransformDataFormat("OutputNCHW", NCHW, "Output", + NHWC); + net.AddInputFromArray("ExpectedOutput", output_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} + } // namespace TEST_F(StridedSliceOpTest, TestStridedSliceByFirstAxis) { @@ -157,6 +240,66 @@ TEST_F(StridedSliceOpTest, TestStridedSliceRank3) { 1, 2}, {1, 1, 3, 3}); } + +TEST_F(StridedSliceOpTest, TestStridedSliceRank4) { + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 0, 0, 0, 0, {1, 2, 1, 2}, + {15, 16, 21, 22}); + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 3, 0, 0, 0, 0, {2, 2, 1, 2}, + {3, 4, 9, 10, 15, 16, 21, 22}); + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 8, 0, 0, 0, {1, 2, 1, 3}, + {15, 16, 17, 21, 22, 23}); + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 8, 0, 0, 8, {1, 2, 1}, + {15, 21}); + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 8, 0, 0, 15, {}, {15}); + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {-1, 2, 1, 3}, + {0, 0, 0, 0}, {-1, -1, -1, -1}, 0, 0, 0, 0, 0, {1, 1, 1, 2}, + {23, 22}); +} + +TEST_F(StridedSliceOpTest, TestStridedSliceWithDataFormat) { + TestStridedSliceWithDataFormat( + {2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 0, 0, 0, 0, {1, 2, 1, 2}, + {15, 16, 21, 22}); + TestStridedSliceWithDataFormat( + {2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 3, 0, 0, 0, 0, {2, 2, 1, 2}, + {3, 4, 9, 10, 15, 16, 21, 22}); + TestStridedSliceWithDataFormat( + {2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 8, 0, 0, 0, {1, 2, 1, 3}, + {15, 16, 17, 21, 22, 23}); + TestStridedSliceWithDataFormat( + {2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0}, + {2, 1}, {1, 1}, 0, 8, 0, 0, 0, {1, 1, 2, 3}, + {12, 13, 14, 15, 16, 17}); + TestStridedSliceWithDataFormat( + {2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0}, + {2, 1}, {1, 1}, 0, 2, 0, 0, 0, {1, 2, 2, 3}, + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); + TestStridedSliceWithDataFormat( + {2, 2, 2, 
3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {-1, 2, 1, 3}, + {0, 0, 0, 0}, {-1, -1, -1, -1}, 0, 0, 0, 0, 0, {1, 1, 1, 2}, + {23, 22}); +} + TEST_F(StridedSliceOpTest, TestSlice) { TestSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {0, 0}, {2, 3}, {2, 3}, {1, 2, 3, 4, 5, 6}); @@ -166,6 +309,17 @@ TEST_F(StridedSliceOpTest, TestSlice) { TestSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {0, 1}, {2, -1}, {2, 2}, {2, 3, 5, 6}); } +TEST_F(StridedSliceOpTest, TestSliceWithDataFormat) { + TestSliceWithDataFormat({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, + {1, 0, 1, 0}, {1, 2, 1, 2}, {1, 2, 1, 2}, + {15, 16, 21, 22}); + TestSliceWithDataFormat({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, + {1, 0, 1, 0}, {-1, -1, -1, -1}, {1, 2, 1, 3}, + {15, 16, 17, 21, 22, 23}); +} + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/sum_group.cc b/mace/ops/sum_group.cc new file mode 100644 index 0000000000000000000000000000000000000000..21c83b68f98b791a9a061fb1226b6b86edfceba6 --- /dev/null +++ b/mace/ops/sum_group.cc @@ -0,0 +1,107 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This Op is for SumGroupComponent in Kaldi. +// It's used to sum up groups of posteriors, +// and to introduce a kind of Gaussian-mixture-model-like +// idea into neural nets. + +#include +#include + +#include "mace/core/operator.h" + +namespace mace { +namespace ops { + +template +class SumGroupOp; + +template +class SumGroupOp : public Operation { + public: + explicit SumGroupOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(this->InputSize() >= 2, + "SumGroup should have at least 2 inputs."); + const Tensor *input = this->Input(0); + // Sizes-input gets a vector saying, for + // each output-dim, how many + // inputs data were summed over. 
+ const Tensor *sizes = this->Input(1); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() >= 1, + "SumGroup's input's rank should be >= 1."); + MACE_CHECK(sizes->dim_size() == 1, + "SumGroup's sizes input should be a vector."); + + const std::vector &input_shape = input->shape(); + const index_t bh = + std::accumulate(input_shape.begin(), input_shape.end() - 1, 1, + std::multiplies()); + std::vector output_shape(input_shape); + const index_t output_dim = sizes->dim(0); + const index_t dim_size = input->dim_size(); + const index_t input_dim = input_shape[dim_size -1]; + output_shape[dim_size - 1] = output_dim; + + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + Tensor::MappingGuard guard_input(input); + Tensor::MappingGuard guard_sizes(sizes); + Tensor::MappingGuard guard_output(output); + const T *input_data = input->data(); + const int *sizes_data = sizes->data(); + T *output_data = output->mutable_data(); + + std::vector> + sum_indexes(static_cast(output_dim)); + + int cur_index = 0; + for (index_t i = 0; i < output_dim; ++i) { + int size_value = sizes_data[i]; + MACE_CHECK(size_value > 0, "size value should be > 0"); + sum_indexes[i].first = cur_index; + cur_index += size_value; + sum_indexes[i].second = cur_index; + MACE_CHECK(cur_index <= input_dim) + << "size value over-ranged:" << cur_index << "<=" << input_dim; + } + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t i = 0; i < bh; ++i) { + for (index_t j = 0; j < output_dim; ++j) { + int start_col = sum_indexes[j].first; + int end_col = sum_indexes[j].second; + T sum = 0; + for (int src_col = start_col; src_col < end_col; ++src_col) { + sum += input_data[i * input_dim + src_col]; + } + output_data[i * output_dim + j] = sum; + } + } + + return MaceStatus::MACE_SUCCESS; + } +}; + +void RegisterSumGroup(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "SumGroup", SumGroupOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/sum_group_benchmark.cc b/mace/ops/sum_group_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..bb3b20e855b23a0babd9d31cee840425cce9545c --- /dev/null +++ b/mace/ops/sum_group_benchmark.cc @@ -0,0 +1,75 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
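To illustrate how SumGroup's sizes input above is turned into per-output column ranges, here is a hedged sketch of the sum_indexes loop (names are hypothetical, not from the patch):

    #include <utility>
    #include <vector>
    // sizes = {2, 1, 2} over input_dim = 5 yields ranges {[0,2), [2,3), [3,5)};
    // output column j is the sum of the input columns in range j.
    std::vector<std::pair<int, int>> GroupRanges(const std::vector<int> &sizes) {
      std::vector<std::pair<int, int>> ranges;
      int cur = 0;
      for (int s : sizes) {
        ranges.emplace_back(cur, cur + s);
        cur += s;
      }
      return ranges;
    }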
+ +#include + +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +namespace { +template +void SumGroupBenchmark(int iters, int n, int h, int w) { + mace::testing::StopTiming(); + OpsTestNet net; + // Add input data + net.AddRandomInput("Input", {n, h, w}); + net.AddRepeatedInput("Sizes", + {w / 2}, + 2); + OpDefBuilder("SumGroup", "SumGroupBM") + .Input("Input") + .Input("Sizes") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + net.Sync(); + } +} +} // namespace + +#define MACE_BM_SUMGROUP_MACRO(N, H, W, TYPE, DEVICE) \ + static void \ + MACE_BM_SUMGROUP_##N##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W; \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + SumGroupBenchmark(iters, N, H, W); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_SUMGROUP_##N##_##H##_##W##_##TYPE##_##DEVICE) + +#define MACE_BM_SUMGROUP(N, H, W) \ + MACE_BM_SUMGROUP_MACRO(N, H, W, float, CPU); + +MACE_BM_SUMGROUP(1, 10, 256); +MACE_BM_SUMGROUP(1, 20, 128); +MACE_BM_SUMGROUP(1, 10, 128); +MACE_BM_SUMGROUP(1, 20, 512); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/sum_group_test.cc b/mace/ops/sum_group_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e5a4ef904871e38688fe18d699955379aeeaf539 --- /dev/null +++ b/mace/ops/sum_group_test.cc @@ -0,0 +1,71 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
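A quick arithmetic check of the SimpleTest expectation below: for the first input row {1..10} with sizes = {2, 1, 2, 3, 2}, the grouped sums are {1+2, 3, 4+5, 6+7+8, 9+10} = {3, 3, 9, 21, 19}, which is the first expected output row.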
+ +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class SumGroupOpTest : public OpsTestBase {}; + +namespace { +template +void TestSumGroup(const std::vector &input_shape, + const std::vector &input, + const std::vector &sizes, + const std::vector &output_shape, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray(MakeString("Input"), + input_shape, + input); + const index_t output_dim = sizes.size(); + net.AddInputFromArray(MakeString("Sizes"), + {output_dim}, + sizes); + + OpDefBuilder("SumGroup", "SumGroupTest") + .Input("Input") + .Input("Sizes") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.AddInputFromArray("ExpectedOutput", output_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(SumGroupOpTest, SimpleTest) { + TestSumGroup( + {1, 5, 10}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {2, 1, 2, 3, 2}, + {1, 5, 5}, + {3, 3, 9, 21, 19, + 5, 4, 11, 24, 21, + 7, 5, 13, 27, 23, + 9, 6, 15, 30, 25, + 11, 7, 17, 33, 27}); +} +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/target_rms_norm.cc b/mace/ops/target_rms_norm.cc new file mode 100644 index 0000000000000000000000000000000000000000..7b769fe712c35cc39cf282731f2a5d64d21d8695 --- /dev/null +++ b/mace/ops/target_rms_norm.cc @@ -0,0 +1,116 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This op is implemented for kaldi's NormalizeComponent. +// The output y_i = scale * x_i, +// and we want the RMS value of the y_i equals to target_rms, +// so y^t y = Dim * target_rms^2 (if y is one row of the input). +// Dim is the length of a row. +// we need the scale = 1.0 / sqrt(x^t x / (Dim * target_rms^2)). 
+ +#include +#include + +#include "mace/core/operator.h" + +namespace mace { +namespace ops { + +template +class TargetRMSNormOp; + +template +class TargetRMSNormOp : public Operation { + public: + explicit TargetRMSNormOp(OpConstructContext *context) + : Operation(context), + target_rms_(Operation::GetOptionalArg("target_rms", 1.0)) {} + + // Calculate the square sum of an array + float SquareSum(const float *data, const index_t data_len) { + const int num_parts = 4; + float result = 0.0f; + if (data_len <= 2 * num_parts) { + for (index_t i = 0; i < data_len; ++i) { + result += data[i] * data[i]; + } + } else { + const index_t part_len = data_len / num_parts; + const index_t left_len = data_len % num_parts; + float results[4] = {0.f, 0.f, 0.f, 0.f}; + for (index_t i = 0; i < num_parts; ++i) { + for (index_t j = 0; j < part_len; ++j) { + results[i] += data[i * part_len + j] * data[i * part_len + j]; + } + } + for (index_t k = 0; k < left_len; ++k) { + float d = data[num_parts * part_len + k]; + results[3] += d * d; + } + + for (index_t i = 0; i < num_parts; ++i) { + result += results[i]; + } + } + + return result; + } + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + const std::vector &input_shape = input->shape(); + const index_t dim_size = input->dim_size(); + MACE_CHECK(dim_size >= 1, + "TargetRMSNorm's input dim size should be >= 1."); + const index_t dim = input_shape[dim_size -1]; + MACE_CHECK(dim > 0 && target_rms_ > 0, + "Both input dim and target rms should be greater than zero."); + const index_t bh = + std::accumulate(input_shape.begin(), input_shape.end() - 1, 1, + std::multiplies()); + const float d_scale = dim * target_rms_ * target_rms_; + + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + Tensor::MappingGuard guard_input(input); + Tensor::MappingGuard guard_output(output); + + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i < bh; ++i) { + float scale = SquareSum(input_data + i * dim, dim); + scale = static_cast(1.0 / std::sqrt(scale / d_scale)); + for (index_t j = 0; j < dim; ++j) { + output_data[i * dim + j] = input_data[i * dim + j] * scale; + } + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + float target_rms_; +}; + +void RegisterTargetRMSNorm(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "TargetRMSNorm", TargetRMSNormOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/target_rms_norm_benchmark.cc b/mace/ops/target_rms_norm_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..d496bb8b101bc8f083e077bc0c8754c1cf932b0f --- /dev/null +++ b/mace/ops/target_rms_norm_benchmark.cc @@ -0,0 +1,74 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
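The SquareSum above splits long rows into four partial accumulators, presumably to expose instruction-level parallelism; a naive single-accumulator reference that computes the same value (a comparison sketch, not from the patch):

    #include <cstddef>
    float SquareSumRef(const float *data, std::size_t n) {
      float acc = 0.f;
      for (std::size_t i = 0; i < n; ++i) {
        acc += data[i] * data[i];  // sum of squares, single accumulator
      }
      return acc;
    }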
+ +#include + +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +namespace { +template +void TargetRMSNormBenchmark(int iters, int n, int h, int w, float target_rms) { + mace::testing::StopTiming(); + + OpsTestNet net; + // Add input data + net.AddRandomInput("Input", {n, h, w}); + + OpDefBuilder("TargetRMSNorm", "TargetRMSNormBM") + .Input("Input") + .AddFloatArg("target_rms", target_rms) + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + net.Sync(); + } +} +} // namespace + +#define MACE_BM_TARGETRMSNORM_MACRO(N, H, W, RMS, TYPE, DEVICE) \ + static void \ + MACE_BM_TARGETRMSNORM_##N##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W; \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + TargetRMSNormBenchmark(iters, N, H, W, RMS); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_TARGETRMSNORM_##N##_##H##_##W##_##TYPE##_##DEVICE) + +#define MACE_BM_TARGETRMSNORM(N, H, W, RMS) \ + MACE_BM_TARGETRMSNORM_MACRO(N, H, W, RMS, float, CPU); + +MACE_BM_TARGETRMSNORM(1, 10, 256, 1.0); +MACE_BM_TARGETRMSNORM(1, 20, 128, 2.0); +MACE_BM_TARGETRMSNORM(1, 10, 128, 0.5); +MACE_BM_TARGETRMSNORM(1, 20, 512, 1.0); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/target_rms_norm_test.cc b/mace/ops/target_rms_norm_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..950824470cac136a77ef274f5fc2895876b73213 --- /dev/null +++ b/mace/ops/target_rms_norm_test.cc @@ -0,0 +1,62 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class TargetRMSNormOpTest : public OpsTestBase {}; + +namespace { +template +void TestTargetRMSNorm(const std::vector &input_shape, + const std::vector &input, + const float target_rms, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray(MakeString("Input"), + input_shape, + input); + + OpDefBuilder("TargetRMSNorm", "TargetRMSNormTest") + .Input("Input") + .AddFloatArg("target_rms", target_rms) + .Output("Output") + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.AddInputFromArray("ExpectedOutput", input_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(TargetRMSNormOpTest, SimpleTest) { + TestTargetRMSNorm( + {1, 3, 3}, + {1, 2, 3, + 2, 3, 4, + 3, 4, 5}, + 1.0, + {0.46291, 0.92582, 1.38873, + 0.64327, 0.9649, 1.28654, + 0.734847, 0.979796, 1.224745}); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/testing/test_utils.h b/mace/ops/testing/test_utils.h index 59e71448f6421d51a8d67f77d17e49420fe7a915..6a0a045b6326a67689f9755bc911a2f54fbc798a 100644 --- a/mace/ops/testing/test_utils.h +++ b/mace/ops/testing/test_utils.h @@ -27,6 +27,7 @@ #include #include "mace/core/tensor.h" +#include "gtest/gtest.h" namespace mace { namespace ops { diff --git a/mace/ops/time_offset.cc b/mace/ops/time_offset.cc new file mode 100644 index 0000000000000000000000000000000000000000..d9343fc327438a965fe4b3e98a583783a6d4993a --- /dev/null +++ b/mace/ops/time_offset.cc @@ -0,0 +1,81 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This Op is for offset descriptor in Kaldi. +// It defines time offset. 
+ +#include +#include + +#include "mace/core/operator.h" +#include "mace/utils/math.h" + +namespace mace { +namespace ops { + +template +class TimeOffsetOp; + +template +class TimeOffsetOp : public Operation { + public: + explicit TimeOffsetOp(OpConstructContext *context) + : Operation(context), + offset_(Operation::GetOptionalArg("offset", 0)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + index_t rank = input->dim_size(); + MACE_CHECK(rank >= 2, "input's rank should >= 2."); + const std::vector &input_shape = input->shape(); + const index_t batch = + std::accumulate(input_shape.begin(), input_shape.end() - 2, 1, + std::multiplies()); + const index_t frames = input_shape[rank - 2]; + const index_t input_dim = input_shape[rank - 1]; + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t i = 0; i < batch; ++i) { + for (index_t j = 0; j < frames; ++j) { + index_t time_index = offset_ + j; + index_t index = Clamp(time_index, 0, frames - 1); + T *output_base = output_data + (i * frames + j) * input_dim; + const T *input_base = input_data + (i * frames + index) * input_dim; + memcpy(output_base, input_base, input_dim * sizeof(T)); + } + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + int offset_; +}; + +void RegisterTimeOffset(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "TimeOffset", TimeOffsetOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/time_offset_benchmark.cc b/mace/ops/time_offset_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..82ea9967a9bd95542f012666593e81005cd64c48 --- /dev/null +++ b/mace/ops/time_offset_benchmark.cc @@ -0,0 +1,78 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <string>
+#include <vector>
+
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+namespace {
+template <DeviceType D, typename T>
+void TimeOffsetBenchmark(int iters,
+                         std::vector<index_t> shape,
+                         int offset) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+
+  // Add input data
+  net.AddRandomInput<D, T>("Input", shape);
+
+  OpDefBuilder("TimeOffset", "TimeOffsetBM")
+      .Input("Input")
+      .Output("Output")
+      .AddIntArg("offset", offset)
+      .Finalize(net.NewOperatorDef());
+
+  // Warm-up
+  for (int i = 0; i < 5; ++i) {
+    net.RunOp(D);
+  }
+  net.Sync();
+
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.RunOp(D);
+  }
+  net.Sync();
+}
+} // namespace
+
+#define MACE_BM_TIMEOFFSET2D_MACRO(H, W, TYPE, DEVICE) \
+  static void MACE_BM_TIMEOFFSET2D_##H##_##W##_##TYPE##_##DEVICE(\
+      int iters) { \
+    const int64_t tot = static_cast<int64_t>(iters) * H * W; \
+    mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
+    TimeOffsetBenchmark<DEVICE, TYPE>(iters, {H, W}, 1); \
+  } \
+  MACE_BENCHMARK(MACE_BM_TIMEOFFSET2D_##H##_##W##_##TYPE##_##DEVICE)
+
+#define MACE_BM_TIMEOFFSET2D(H, W) \
+  MACE_BM_TIMEOFFSET2D_MACRO(H, W, float, CPU);
+
+
+MACE_BM_TIMEOFFSET2D(20, 128);
+MACE_BM_TIMEOFFSET2D(40, 512);
+MACE_BM_TIMEOFFSET2D(1, 1024);
+MACE_BM_TIMEOFFSET2D(20, 2048);
+MACE_BM_TIMEOFFSET2D(20, 512);
+
+} // namespace test
+} // namespace ops
+} // namespace mace
diff --git a/mace/ops/time_offset_test.cc b/mace/ops/time_offset_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b32b8c52acf3b8af715dac74f92d4a87efe1a102
--- /dev/null
+++ b/mace/ops/time_offset_test.cc
@@ -0,0 +1,125 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
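+
+// The cases below exercise the boundary clamping in TimeOffsetOp: with
+// offset -2 every output row of a chunk is copied from input row 0 (the
+// first frame repeated), while offset +2 pins every row to the last frame.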
+
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+class TimeOffsetOpTest : public OpsTestBase {};
+
+namespace {
+template <typename T>
+void TestTimeOffset(const std::vector<index_t> &input_shape,
+                    const std::vector<T> &input,
+                    const int offset,
+                    const std::vector<T> &output) {
+  OpsTestNet net;
+  net.AddInputFromArray<DeviceType::CPU, T>(MakeString("Input"),
+                                            input_shape,
+                                            input);
+
+  OpDefBuilder("TimeOffset", "TimeOffsetTest")
+      .Input("Input")
+      .Output("Output")
+      .AddIntArg("offset", offset)
+      .Finalize(net.NewOperatorDef());
+
+  net.RunOp();
+
+  net.AddInputFromArray<DeviceType::CPU, T>("ExpectedOutput", input_shape,
+                                            output);
+  ExpectTensorNear<T>(*net.GetOutput("ExpectedOutput"),
+                      *net.GetOutput("Output"));
+}
+} // namespace
+
+TEST_F(TimeOffsetOpTest, Simple2Dim) {
+  TestTimeOffset<float>(
+    {3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    -2,
+    {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
+
+  TestTimeOffset<float>(
+    {3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    -1,
+    {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+
+  TestTimeOffset<float>(
+    {3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    0,
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
+
+  TestTimeOffset<float>(
+    {3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    1,
+    {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
+
+  TestTimeOffset<float>(
+    {3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    2,
+    {11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
+}
+
+
+TEST_F(TimeOffsetOpTest, Simple3Dim) {
+  TestTimeOffset<float>(
+    {2, 3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    -2,
+    {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5,
+     1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
+
+  TestTimeOffset<float>(
+    {2, 3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    -1,
+    {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+     1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+
+  TestTimeOffset<float>(
+    {2, 3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    0,
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
+
+  TestTimeOffset<float>(
+    {2, 3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    1,
+    {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15,
+     6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
+
+  TestTimeOffset<float>(
+    {2, 3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    2,
+    {11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15,
+     11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
+}
+
+} // namespace test
+} // namespace ops
+} // namespace mace
diff --git a/mace/port/BUILD.bazel b/mace/port/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..d23633a6a6290f109c1061191bcbf48d81aa2fa9
--- /dev/null
+++ b/mace/port/BUILD.bazel
@@ -0,0 +1,52 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"]) # Apache 2.0
+
+cc_library(
+    name = "port",
+    deps = [
+        "//mace/port/android:port_android",
+        "//mace/port/darwin:port_darwin",
+        "//mace/port/linux:port_linux",
+    ],
+)
+
+cc_library(
+    name = "port_api",
+    hdrs = [
+        "env.h",
+        "file_system.h",
+        "logger.h",
+    ],
+    deps = [
+        "//mace/public",
+    ],
+)
+
+cc_library(
+    name = "port_base",
+    srcs = [
+        "env.cc",
+        "logger.cc",
+    ],
+    deps = [
+        ":port_api",
+        "//mace/utils",
+    ],
+)
+
+cc_test(
+    name = "port_test",
+    testonly = 1,
+    srcs = glob([
+        "*_test.cc",
+    ]),
+    linkstatic = 1,
+    deps = [
+        ":port",
+        "@gtest//:gtest",
+        "@gtest//:gtest_main",
+    ],
+)
diff --git a/mace/port/README.md b/mace/port/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ecfff01571db8bcfa4733e1f4e4979763b81d8d
--- /dev/null
+++ b/mace/port/README.md
@@ -0,0 +1,14 @@
+# port
+
+This module contains the interface and implementations for different
+platforms. All platform-specific code should go here; other modules must
+not use non-standard headers.
+
+The module is split into `port_api` and `port`. `port_api` is the interface,
+and it must not depend on any other module, including `utils`.
+
+If the code base grows large in the future, it should be split into core and
+test parts to keep the footprint of the production libraries small.
+
+Currently Linux and Darwin (macOS, iOS, etc.) are both treated as POSIX;
+they can be handled separately later if needed.
diff --git a/mace/port/android/BUILD.bazel b/mace/port/android/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..fd5aacc51f3653a32a6fa4b5f5752772d6dd20bc
--- /dev/null
+++ b/mace/port/android/BUILD.bazel
@@ -0,0 +1,22 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"]) # Apache 2.0
+
+load("//mace:mace.bzl", "if_android")
+
+cc_library(
+    name = "port_android",
+    srcs = if_android(glob([
+        "*.cc",
+    ])),
+    hdrs = if_android(glob([
+        "*.h",
+    ])),
+    deps = [
+        "//mace/port:port_base",
+        "//mace/port/posix:port_posix",
+    ],
+    alwayslink = 1,
+)
diff --git a/mace/port/android/env.cc b/mace/port/android/env.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2940d344cf3a2d8f3b2fdafe72ef85904e4db442
--- /dev/null
+++ b/mace/port/android/env.cc
@@ -0,0 +1,204 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
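+
+// AndroidEnv implements the mace::port::Env interface declared in
+// mace/port/env.h. As described in mace/port/README.md, other modules reach
+// this code only through Env::Default() and the inline wrappers in env.h,
+// e.g. (illustrative) mace::NowMicros().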
+ +#include "mace/port/android/env.h" + +#include +#include +#include +#include +#include +#include + +#ifdef __hexagon__ +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "mace/port/android/malloc_logger.h" +#include "mace/port/posix/time.h" +#include "mace/utils/macros.h" +#include "mace/utils/memory.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace port { + +int64_t AndroidEnv::NowMicros() { +#ifdef __hexagon__ + return HAP_perf_get_time_us(); +#else + return mace::port::posix::NowMicros(); +#endif +} + +FileSystem *AndroidEnv::GetFileSystem() { + return &posix_file_system_; +} + +LogWriter *AndroidEnv::GetLogWriter() { + return &log_writer_; +} + +namespace { + +int GetCPUCount() { + int cpu_count = 0; + std::string cpu_sys_conf = "/proc/cpuinfo"; + std::ifstream f(cpu_sys_conf); + if (!f.is_open()) { + LOG(ERROR) << "failed to open " << cpu_sys_conf; + return -1; + } + std::string line; + const std::string processor_key = "processor"; + while (std::getline(f, line)) { + if (line.size() >= processor_key.size() + && line.compare(0, processor_key.size(), processor_key) == 0) { + ++cpu_count; + } + } + if (f.bad()) { + LOG(ERROR) << "failed to read " << cpu_sys_conf; + } + if (!f.eof()) { + LOG(ERROR) << "failed to read end of " << cpu_sys_conf; + } + f.close(); + VLOG(1) << "CPU cores: " << cpu_count; + return cpu_count; +} + +struct BacktraceState { + void** current; + void** end; +}; + +_Unwind_Reason_Code UnwindCallback(struct _Unwind_Context* context, void* arg) { + BacktraceState* state = static_cast(arg); + uintptr_t pc = _Unwind_GetIP(context); + if (pc) { + if (state->current == state->end) { + return _URC_END_OF_STACK; + } else { + *state->current++ = reinterpret_cast(pc); + } + } + return _URC_NO_REASON; +} + +size_t BackTrace(void** buffer, size_t max) { + BacktraceState state = {buffer, buffer + max}; + _Unwind_Backtrace(UnwindCallback, &state); + + return state.current - buffer; +} + +} // namespace + +MaceStatus AndroidEnv::GetCPUMaxFreq(std::vector *max_freqs) { + MACE_CHECK_NOTNULL(max_freqs); + int cpu_count = GetCPUCount(); + if (cpu_count < 0) { + return MaceStatus::MACE_RUNTIME_ERROR; + } + for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) { + std::string cpuinfo_max_freq_sys_conf = MakeString( + "/sys/devices/system/cpu/cpu", + cpu_id, + "/cpufreq/cpuinfo_max_freq"); + std::ifstream f(cpuinfo_max_freq_sys_conf); + if (!f.is_open()) { + LOG(ERROR) << "failed to open " << cpuinfo_max_freq_sys_conf; + return MaceStatus::MACE_RUNTIME_ERROR; + } + std::string line; + if (std::getline(f, line)) { + float freq = strtof(line.c_str(), nullptr); + max_freqs->push_back(freq); + } + if (f.bad()) { + LOG(ERROR) << "failed to read " << cpuinfo_max_freq_sys_conf; + } + f.close(); + } + + VLOG(1) << "CPU freq: " << MakeString(*max_freqs); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus AndroidEnv::SchedSetAffinity(const std::vector &cpu_ids) { + // compute mask + cpu_set_t mask; + CPU_ZERO(&mask); + for (auto cpu_id : cpu_ids) { + CPU_SET(cpu_id, &mask); + } + pid_t pid = gettid(); + int err = sched_setaffinity(pid, sizeof(mask), &mask); + if (err) { + LOG(WARNING) << "SchedSetAffinity failed: " << strerror(errno); + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "SchedSetAffinity failed: " + + std::string(strerror(errno))); + } + + return MaceStatus::MACE_SUCCESS; +} + +std::vector AndroidEnv::GetBackTraceUnsafe(int max_steps) { + std::vector buffer(max_steps, 0); + int steps = 
BackTrace(buffer.data(), max_steps);
+
+  std::vector<std::string> bt;
+  for (int i = 0; i < steps; ++i) {
+    std::ostringstream os;
+
+    const void* addr = buffer[i];
+    const char* symbol = "";
+    Dl_info info;
+    if (dladdr(addr, &info) && info.dli_sname) {
+      symbol = info.dli_sname;
+    }
+
+    os << "pc " << addr << " " << symbol;
+
+    bt.push_back(os.str());
+  }
+
+  return bt;
+}
+
+std::unique_ptr<MallocLogger> AndroidEnv::NewMallocLogger(
+    std::ostringstream *oss,
+    const std::string &name) {
+  return make_unique<AndroidMallocLogger>(oss, name);
+}
+
+Env *Env::Default() {
+  static AndroidEnv android_env;
+  return &android_env;
+}
+
+} // namespace port
+} // namespace mace
diff --git a/mace/port/android/env.h b/mace/port/android/env.h
new file mode 100644
index 0000000000000000000000000000000000000000..c51c57727d999ee2709fa14302ac51a7dbe021cf
--- /dev/null
+++ b/mace/port/android/env.h
@@ -0,0 +1,49 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_PORT_ANDROID_ENV_H_
+#define MACE_PORT_ANDROID_ENV_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mace/port/android/logger.h"
+#include "mace/port/posix/file_system.h"
+#include "mace/port/env.h"
+
+namespace mace {
+namespace port {
+
+class AndroidEnv : public Env {
+ public:
+  int64_t NowMicros() override;
+  MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs) override;
+  MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) override;
+  FileSystem *GetFileSystem() override;
+  LogWriter *GetLogWriter() override;
+  std::vector<std::string> GetBackTraceUnsafe(int max_steps) override;
+  std::unique_ptr<MallocLogger> NewMallocLogger(
+      std::ostringstream *oss,
+      const std::string &name) override;
+
+ private:
+  PosixFileSystem posix_file_system_;
+  AndroidLogWriter log_writer_;
+};
+
+} // namespace port
+} // namespace mace
+
+#endif // MACE_PORT_ANDROID_ENV_H_
diff --git a/mace/port/android/logger.cc b/mace/port/android/logger.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e6f57f4336f3313ef7624d1e32615ad7ba725d9a
--- /dev/null
+++ b/mace/port/android/logger.cc
@@ -0,0 +1,58 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
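+
+// Maps MACE severities onto Android log priorities, then delegates to the
+// base LogWriter so each message is also mirrored to the console for
+// standalone (non-logcat) runs.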
+ +#include "mace/port/android/logger.h" + +#include + +#include + +namespace mace { +namespace port { + +void AndroidLogWriter::WriteLogMessage(const char *fname, + const int line, + const LogLevel severity, + const char *message) { + int android_log_level; + switch (severity) { + case INFO: + android_log_level = ANDROID_LOG_INFO; + break; + case WARNING: + android_log_level = ANDROID_LOG_WARN; + break; + case ERROR: + android_log_level = ANDROID_LOG_ERROR; + break; + case FATAL: + android_log_level = ANDROID_LOG_FATAL; + break; + default: + android_log_level = ANDROID_LOG_ERROR; + break; + } + + std::stringstream ss; + const char *const partial_name = strrchr(fname, '/'); + ss << (partial_name != nullptr ? partial_name + 1 : fname) << ":" << line + << " " << message; + __android_log_write(android_log_level, "MACE", ss.str().c_str()); + + // Also log to stderr (for standalone Android apps) and abort. + LogWriter::WriteLogMessage(fname, line, severity, message); +} + +} // namespace port +} // namespace mace diff --git a/mace/port/android/logger.h b/mace/port/android/logger.h new file mode 100644 index 0000000000000000000000000000000000000000..fccfb83515c360bf61245dfcabfe73776b7702a7 --- /dev/null +++ b/mace/port/android/logger.h @@ -0,0 +1,34 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_PORT_ANDROID_LOGGER_H_ +#define MACE_PORT_ANDROID_LOGGER_H_ + +#include "mace/port/logger.h" + +namespace mace { +namespace port { + +class AndroidLogWriter : public LogWriter { + protected: + void WriteLogMessage(const char *fname, + const int line, + const LogLevel severity, + const char *message) override; +}; + +} // namespace port +} // namespace mace + +#endif // MACE_PORT_ANDROID_LOGGER_H_ diff --git a/mace/port/android/malloc_logger.cc b/mace/port/android/malloc_logger.cc new file mode 100644 index 0000000000000000000000000000000000000000..afaef724309d103ed15ac584b8f41c49d92c363d --- /dev/null +++ b/mace/port/android/malloc_logger.cc @@ -0,0 +1,100 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/port/android/malloc_logger.h" + +#include + +#include +#include + +namespace mace { +namespace port { + +namespace { +struct mallinfo LogMallinfoChange(std::ostringstream *oss, + const std::string &name, + const struct mallinfo curr, + const struct mallinfo prev) { + if (prev.arena != curr.arena) { + (*oss) << "[" << name << "] " + << "Non-mmapped space allocated (bytes): " << curr.arena + << ", diff: " << ((int64_t)curr.arena - (int64_t)prev.arena); + } + if (prev.ordblks != curr.ordblks) { + (*oss) << "[" << name << "] " + << "Number of free chunks: " << curr.ordblks << ", diff: " + << ((int64_t)curr.ordblks - (int64_t)prev.ordblks); + } + if (prev.smblks != curr.smblks) { + (*oss) << "[" << name << "] " + << "Number of free fastbin blocks: " << curr.smblks + << ", diff: " << ((int64_t)curr.smblks - (int64_t)prev.smblks); + } + if (prev.hblks != curr.hblks) { + (*oss) << "[" << name << "] " + << "Number of mmapped regions: " << curr.hblks + << ", diff: " << ((int64_t)curr.hblks - (int64_t)prev.hblks); + } + if (prev.hblkhd != curr.hblkhd) { + (*oss) << "[" << name << "] " + << "Space allocated in mmapped regions (bytes): " << curr.hblkhd + << ", diff: " << ((int64_t)curr.hblkhd - (int64_t)prev.hblkhd); + } + if (prev.usmblks != curr.usmblks) { + (*oss) << "[" << name << "] " + << "Maximum total allocated space (bytes): " << curr.usmblks + << ", diff: " + << ((int64_t)curr.usmblks - (int64_t)prev.usmblks); + } + if (prev.fsmblks != curr.fsmblks) { + (*oss) << "[" << name << "] " + << "Space in freed fastbin blocks (bytes): " << curr.fsmblks + << ", diff: " + << ((int64_t)curr.fsmblks - (int64_t)prev.fsmblks); + } + if (prev.uordblks != curr.uordblks) { + (*oss) << "[" << name << "] " + << "Total allocated space (bytes): " << curr.uordblks + << ", diff: " + << ((int64_t)curr.uordblks - (int64_t)prev.uordblks); + } + if (prev.fordblks != curr.fordblks) { + (*oss) << "[" << name << "] " + << "Total free space (bytes): " << curr.fordblks << ", diff: " + << ((int64_t)curr.fordblks - (int64_t)prev.fordblks); + } + if (prev.keepcost != curr.keepcost) { + (*oss) << "[" << name << "] " + << "Top-most, releasable space (bytes): " << curr.keepcost + << ", diff: " + << ((int64_t)curr.keepcost - (int64_t)prev.keepcost); + } + return curr; +} +} // namespace + +AndroidMallocLogger::AndroidMallocLogger(std::ostringstream *oss, + const std::string &name) : + oss_(oss), name_(name) { + prev_ = mallinfo(); +} + +AndroidMallocLogger::~AndroidMallocLogger() { + struct mallinfo curr = mallinfo(); + LogMallinfoChange(oss_, name_, curr, prev_); +} + +} // namespace port +} // namespace mace diff --git a/mace/port/android/malloc_logger.h b/mace/port/android/malloc_logger.h new file mode 100644 index 0000000000000000000000000000000000000000..9bc7052455b1a8445fa8b0719a82c956a6436ea4 --- /dev/null +++ b/mace/port/android/malloc_logger.h @@ -0,0 +1,43 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_PORT_ANDROID_MALLOC_LOGGER_H_ +#define MACE_PORT_ANDROID_MALLOC_LOGGER_H_ + +#include + +#include + +#include "mace/port/env.h" + +namespace mace { +namespace port { + +class AndroidMallocLogger : public MallocLogger { + public: + explicit AndroidMallocLogger(std::ostringstream *oss, + const std::string &name); + ~AndroidMallocLogger() override; + + private: + std::ostringstream *oss_; + const std::string name_; + struct mallinfo prev_; +}; + +} // namespace port +} // namespace mace + + +#endif // MACE_PORT_ANDROID_MALLOC_LOGGER_H_ diff --git a/mace/port/darwin/BUILD.bazel b/mace/port/darwin/BUILD.bazel new file mode 100644 index 0000000000000000000000000000000000000000..987dafd16ea22f3f8b5b97052d0672f18c81c98d --- /dev/null +++ b/mace/port/darwin/BUILD.bazel @@ -0,0 +1,22 @@ +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # Apache 2.0 + +load("//mace:mace.bzl", "if_darwin") + +cc_library( + name = "port_darwin", + srcs = if_darwin(glob([ + "*.cc", + ])), + hdrs = if_darwin(glob([ + "*.h", + ])), + deps = [ + "//mace/port:port_base", + "//mace/port/posix:port_posix", + ], + alwayslink = 1, +) diff --git a/mace/port/darwin/env.cc b/mace/port/darwin/env.cc new file mode 100644 index 0000000000000000000000000000000000000000..f951e64753b9736705b67153a7ef3ba82cb72e73 --- /dev/null +++ b/mace/port/darwin/env.cc @@ -0,0 +1,53 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/port/darwin/env.h" + +#include +#include + +#include +#include +#include + +#include "mace/port/posix/backtrace.h" +#include "mace/port/posix/file_system.h" +#include "mace/port/posix/time.h" + +namespace mace { +namespace port { + +int64_t DarwinEnv::NowMicros() { + return mace::port::posix::NowMicros(); +} + +FileSystem *DarwinEnv::GetFileSystem() { + return &posix_file_system_; +} + +LogWriter *DarwinEnv::GetLogWriter() { + return &log_writer_; +} + +std::vector DarwinEnv::GetBackTraceUnsafe(int max_steps) { + return mace::port::posix::GetBackTraceUnsafe(max_steps); +} + +Env *Env::Default() { + static DarwinEnv darwin_env; + return &darwin_env; +} + +} // namespace port +} // namespace mace diff --git a/mace/port/darwin/env.h b/mace/port/darwin/env.h new file mode 100644 index 0000000000000000000000000000000000000000..667cf9f0a0e2f102c1ddc183605eea1f22dfa0c6 --- /dev/null +++ b/mace/port/darwin/env.h @@ -0,0 +1,43 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_PORT_DARWIN_ENV_H_
+#define MACE_PORT_DARWIN_ENV_H_
+
+#include <string>
+#include <vector>
+
+#include "mace/port/env.h"
+#include "mace/port/logger.h"
+#include "mace/port/posix/file_system.h"
+
+namespace mace {
+namespace port {
+
+class DarwinEnv : public Env {
+ public:
+  int64_t NowMicros() override;
+  FileSystem *GetFileSystem() override;
+  LogWriter *GetLogWriter() override;
+  std::vector<std::string> GetBackTraceUnsafe(int max_steps) override;
+
+ private:
+  PosixFileSystem posix_file_system_;
+  LogWriter log_writer_;
+};
+
+} // namespace port
+} // namespace mace
+
+#endif // MACE_PORT_DARWIN_ENV_H_
diff --git a/mace/port/env.cc b/mace/port/env.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b78e1c82d4d417ccc2d2be9e2dc24cd3867e4cc1
--- /dev/null
+++ b/mace/port/env.cc
@@ -0,0 +1,40 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/port/env.h"
+
+#include <memory>
+
+#include "mace/utils/memory.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace port {
+
+MaceStatus Env::GetCPUMaxFreq(std::vector<float> *max_freqs) {
+  return MaceStatus::MACE_UNSUPPORTED;
+}
+
+MaceStatus Env::SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
+  return MaceStatus::MACE_UNSUPPORTED;
+}
+
+std::unique_ptr<MallocLogger> Env::NewMallocLogger(
+    std::ostringstream *oss,
+    const std::string &name) {
+  return make_unique<MallocLogger>();
+}
+
+} // namespace port
+} // namespace mace
diff --git a/mace/port/env.h b/mace/port/env.h
new file mode 100644
index 0000000000000000000000000000000000000000..af98cc5a5bc61e40fdc52edc04376aac80c2f740
--- /dev/null
+++ b/mace/port/env.h
@@ -0,0 +1,75 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
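+
+// Env is the platform service interface: each supported platform links in
+// exactly one Env::Default() definition (Android, Darwin, Linux). A minimal
+// illustrative use via the inline wrappers declared below:
+//
+//   std::vector<float> freqs;
+//   MaceStatus s = mace::GetCPUMaxFreq(&freqs);  // MACE_UNSUPPORTED if the
+//                                                // platform lacks support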
+
+#ifndef MACE_PORT_ENV_H_
+#define MACE_PORT_ENV_H_
+
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace port {
+
+class MallocLogger {
+ public:
+  MallocLogger() = default;
+  virtual ~MallocLogger() = default;
+};
+
+class FileSystem;
+class LogWriter;
+
+class Env {
+ public:
+  virtual int64_t NowMicros() = 0;
+  virtual MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs);
+  virtual MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids);
+  virtual FileSystem *GetFileSystem() = 0;
+  virtual LogWriter *GetLogWriter() = 0;
+  // Returns the current backtrace; memory is allocated inside the call,
+  // which may itself fail.
+  virtual std::vector<std::string> GetBackTraceUnsafe(int max_steps) = 0;
+  virtual std::unique_ptr<MallocLogger> NewMallocLogger(
+      std::ostringstream *oss,
+      const std::string &name);
+
+  static Env *Default();
+};
+
+} // namespace port
+
+inline int64_t NowMicros() {
+  return port::Env::Default()->NowMicros();
+}
+
+inline MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs) {
+  return port::Env::Default()->GetCPUMaxFreq(max_freqs);
+}
+
+inline MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
+  return port::Env::Default()->SchedSetAffinity(cpu_ids);
+}
+
+inline port::FileSystem *GetFileSystem() {
+  return port::Env::Default()->GetFileSystem();
+}
+
+} // namespace mace
+
+#endif // MACE_PORT_ENV_H_
diff --git a/mace/port/env_test.cc b/mace/port/env_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d23b5787a231d09722470efcf42f9ce9eedc2c13
--- /dev/null
+++ b/mace/port/env_test.cc
@@ -0,0 +1,41 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/port/env.h"
+
+#include <gtest/gtest.h>
+
+namespace mace {
+namespace {
+
+class EnvTest : public ::testing::Test {
+};
+
+TEST_F(EnvTest, NowMicros) {
+  EXPECT_GT(NowMicros(), 0);
+}
+
+TEST_F(EnvTest, GetFileSystem) {
+  GetFileSystem();
+}
+
+TEST_F(EnvTest, CPUInfo) {
+  std::vector<float> freq;
+  GetCPUMaxFreq(&freq);
+  std::vector<size_t> cpu_ids;
+  SchedSetAffinity(cpu_ids);
+}
+
+} // namespace
+} // namespace mace
diff --git a/mace/port/file_system.h b/mace/port/file_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..91b6f458d3a021c9163e7cc07c1404805f2aae43
--- /dev/null
+++ b/mace/port/file_system.h
@@ -0,0 +1,45 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
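+
+// ReadOnlyMemoryRegion is a zero-copy, read-only view of a file (mmap-backed
+// in the POSIX implementation). Illustrative use, with a hypothetical file
+// name:
+//
+//   std::unique_ptr<port::ReadOnlyMemoryRegion> region;
+//   GetFileSystem()->NewReadOnlyMemoryRegionFromFile("model.data", &region);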
+ +#ifndef MACE_PORT_FILE_SYSTEM_H_ +#define MACE_PORT_FILE_SYSTEM_H_ + +#include +#include + +#include "mace/public/mace.h" + +namespace mace { +namespace port { + +class ReadOnlyMemoryRegion { + public: + ReadOnlyMemoryRegion() = default; + virtual ~ReadOnlyMemoryRegion() = default; + virtual const void *data() = 0; + virtual uint64_t length() = 0; +}; + +class FileSystem { + public: + FileSystem() = default; + virtual ~FileSystem() = default; + virtual MaceStatus NewReadOnlyMemoryRegionFromFile(const char *fname, + std::unique_ptr* result) = 0; +}; + +} // namespace port +} // namespace mace + +#endif // MACE_PORT_FILE_SYSTEM_H_ diff --git a/mace/port/linux/BUILD.bazel b/mace/port/linux/BUILD.bazel new file mode 100644 index 0000000000000000000000000000000000000000..5d1351baf844c4e90f6259fddb97b6217dd769b2 --- /dev/null +++ b/mace/port/linux/BUILD.bazel @@ -0,0 +1,22 @@ +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # Apache 2.0 + +load("//mace:mace.bzl", "if_linux") + +cc_library( + name = "port_linux", + srcs = if_linux(glob([ + "*.cc", + ])), + hdrs = if_linux(glob([ + "*.h", + ])), + deps = [ + "//mace/port:port_base", + "//mace/port/posix:port_posix", + ], + alwayslink = 1, +) diff --git a/mace/port/linux/env.cc b/mace/port/linux/env.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a50b4a1198049d5610f3daad5b33f47efb97c4a --- /dev/null +++ b/mace/port/linux/env.cc @@ -0,0 +1,53 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/port/linux/env.h" + +#include +#include + +#include +#include +#include + +#include "mace/port/posix/backtrace.h" +#include "mace/port/posix/file_system.h" +#include "mace/port/posix/time.h" + +namespace mace { +namespace port { + +int64_t LinuxEnv::NowMicros() { + return mace::port::posix::NowMicros(); +} + +FileSystem *LinuxEnv::GetFileSystem() { + return &posix_file_system_; +} + +LogWriter *LinuxEnv::GetLogWriter() { + return &log_writer_; +} + +std::vector LinuxEnv::GetBackTraceUnsafe(int max_steps) { + return mace::port::posix::GetBackTraceUnsafe(max_steps); +} + +Env *Env::Default() { + static LinuxEnv linux_env; + return &linux_env; +} + +} // namespace port +} // namespace mace diff --git a/mace/port/linux/env.h b/mace/port/linux/env.h new file mode 100644 index 0000000000000000000000000000000000000000..5d1d243a1ab616c3bf13d6d9069147e7cced4519 --- /dev/null +++ b/mace/port/linux/env.h @@ -0,0 +1,43 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_PORT_LINUX_ENV_H_ +#define MACE_PORT_LINUX_ENV_H_ + +#include +#include + +#include "mace/port/env.h" +#include "mace/port/logger.h" +#include "mace/port/posix/file_system.h" + +namespace mace { +namespace port { + +class LinuxEnv : public Env { + public: + int64_t NowMicros() override; + FileSystem *GetFileSystem() override; + LogWriter *GetLogWriter() override; + std::vector GetBackTraceUnsafe(int max_steps) override; + + private: + PosixFileSystem posix_file_system_; + LogWriter log_writer_; +}; + +} // namespace port +} // namespace mace + +#endif // MACE_PORT_LINUX_ENV_H_ diff --git a/mace/port/logger.cc b/mace/port/logger.cc new file mode 100644 index 0000000000000000000000000000000000000000..b02f6f4455d92a275470c0c762edf20c257d2b38 --- /dev/null +++ b/mace/port/logger.cc @@ -0,0 +1,115 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/port/logger.h" + +#include +#include +#include +#include + +#include "mace/port/env.h" +#include "mace/utils/string_util.h" + +namespace mace { +namespace port { + +inline bool IsValidLogLevel(const LogLevel level) { + return level > LogLevel::INVALID_MIN && + level < LogLevel::INVALID_MAX; +} + +LogLevel LogLevelFromStr(const char *log_level_str) { + if (log_level_str != nullptr) { + std::string ls = ToUpper(log_level_str); + + if (ls == "I" || ls == "INFO") { + return LogLevel::INFO; + } + if (ls == "W" || ls == "WARNING") { + return LogLevel::WARNING; + } + if (ls == "E" || ls == "ERROR") { + return LogLevel::ERROR; + } + if (ls == "F" || ls == "FATAL") { + return LogLevel::FATAL; + } + } + + return LogLevel::INVALID_MIN; +} + +char LogLevelToShortStr(LogLevel level) { + if (!IsValidLogLevel(level)) { + level = LogLevel::INFO; + } + + return "IWEF"[static_cast(level) - 1]; +} + +int VLogLevelFromStr(const char *vlog_level_str) { + if (vlog_level_str != nullptr) { + return atoi(vlog_level_str); + } + + return 0; +} + + +void LogWriter::WriteLogMessage(const char *fname, + const int line, + const LogLevel severity, + const char *message) { + printf("%c %s:%d] %s\n", LogLevelToShortStr(severity), fname, line, message); +} + +Logger::Logger(const char *fname, int line, LogLevel severity) + : fname_(fname), line_(line), severity_(severity) {} + +void Logger::GenerateLogMessage() { + LogWriter *log_writer = Env::Default()->GetLogWriter(); + log_writer->WriteLogMessage(fname_, line_, severity_, str().c_str()); + + // When there is a fatal log, terminate execution + if (severity_ == LogLevel::FATAL) { + DealWithFatal(); + } +} + +void Logger::DealWithFatal() { + // When there is a fatal log, log the backtrace and abort. + LogWriter *log_writer = Env::Default()->GetLogWriter(); + std::vector bt = Env::Default()->GetBackTraceUnsafe(50); + if (!bt.empty()) { + log_writer->WriteLogMessage(fname_, line_, severity_, "backtrace:"); + for (size_t i = 0; i < bt.size(); ++i) { + std::ostringstream os; + os << " " << bt[i]; + log_writer->WriteLogMessage(fname_, line_, severity_, os.str().c_str()); + } + } + + abort(); +} + +Logger::~Logger() { + static const LogLevel min_log_level = MinLogLevelFromEnv(); + if (LogLevelPassThreashold(severity_, min_log_level)) { + GenerateLogMessage(); + } +} + +} // namespace port +} // namespace mace diff --git a/mace/port/logger.h b/mace/port/logger.h new file mode 100644 index 0000000000000000000000000000000000000000..08bcbbe4a8c3447332d897d602e05d9a38f6659e --- /dev/null +++ b/mace/port/logger.h @@ -0,0 +1,95 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#ifndef MACE_PORT_LOGGER_H_
+#define MACE_PORT_LOGGER_H_
+
+#include <cstdlib>
+#include <sstream>
+#include <string>
+
+namespace mace {
+
+enum LogLevel {
+  INVALID_MIN = 0,
+  INFO = 1,
+  WARNING = 2,
+  ERROR = 3,
+  FATAL = 4,
+  INVALID_MAX,
+};
+
+namespace port {
+
+inline bool LogLevelPassThreashold(const LogLevel level,
+                                   const LogLevel threshold) {
+  return level >= threshold;
+}
+
+LogLevel LogLevelFromStr(const char *log_level_str);
+int VLogLevelFromStr(const char *vlog_level_str);
+
+inline LogLevel MinLogLevelFromEnv() {
+  // Read the min log level from env once during the first call to logging.
+  static LogLevel log_level = LogLevelFromStr(getenv("MACE_CPP_MIN_LOG_LEVEL"));
+  return log_level;
+}
+
+inline int MinVLogLevelFromEnv() {
+  // Read the min vlog level from env once during the first call to logging.
+  static int vlog_level = VLogLevelFromStr(getenv("MACE_CPP_MIN_VLOG_LEVEL"));
+  return vlog_level;
+}
+
+class LogWriter {
+ public:
+  LogWriter() = default;
+  virtual ~LogWriter() = default;
+  virtual void WriteLogMessage(const char *fname,
+                               const int line,
+                               const LogLevel severity,
+                               const char *message);
+};
+
+class Logger : public std::ostringstream {
+ public:
+  Logger(const char *fname, int line, LogLevel severity);
+  ~Logger();
+
+ private:
+  void GenerateLogMessage();
+  void DealWithFatal();
+
+  const char *fname_;
+  int line_;
+  LogLevel severity_;
+};
+
+} // namespace port
+
+// Whether the log level passes the env-configured threshold; can be used
+// for short-circuiting.
+inline bool ShouldGenerateLogMessage(LogLevel severity) {
+  LogLevel threshold = port::MinLogLevelFromEnv();
+  return port::LogLevelPassThreashold(severity, threshold);
+}
+
+inline bool ShouldGenerateVLogMessage(int vlog_level) {
+  int threshold = port::MinVLogLevelFromEnv();
+  return ShouldGenerateLogMessage(INFO) &&
+         vlog_level <= threshold;
+}
+} // namespace mace
+
+#endif // MACE_PORT_LOGGER_H_
diff --git a/mace/port/logger_test.cc b/mace/port/logger_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..93df626ba8a2d61821dd68f07b3a3823fff8a5de
--- /dev/null
+++ b/mace/port/logger_test.cc
@@ -0,0 +1,44 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
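+
+// Sanity checks for the string -> LogLevel parsing used by
+// MACE_CPP_MIN_LOG_LEVEL: the short form ("W") and the full form ("WARNING")
+// must select the same severity.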
+ +#include "mace/port/logger.h" + +#include + +namespace mace { +namespace { + +class LoggerTest : public ::testing::Test { +}; + +TEST_F(LoggerTest, LogLevel) { + EXPECT_EQ(INFO, port::LogLevelFromStr("i")); + EXPECT_EQ(INFO, port::LogLevelFromStr("I")); + EXPECT_EQ(INFO, port::LogLevelFromStr("INFO")); + + EXPECT_EQ(WARNING, port::LogLevelFromStr("w")); + EXPECT_EQ(WARNING, port::LogLevelFromStr("W")); + EXPECT_EQ(WARNING, port::LogLevelFromStr("WARNING")); + + EXPECT_EQ(ERROR, port::LogLevelFromStr("e")); + EXPECT_EQ(ERROR, port::LogLevelFromStr("E")); + EXPECT_EQ(ERROR, port::LogLevelFromStr("ERROR")); + + EXPECT_EQ(FATAL, port::LogLevelFromStr("f")); + EXPECT_EQ(FATAL, port::LogLevelFromStr("F")); + EXPECT_EQ(FATAL, port::LogLevelFromStr("FATAL")); +} + +} // namespace +} // namespace mace diff --git a/mace/port/posix/BUILD.bazel b/mace/port/posix/BUILD.bazel new file mode 100644 index 0000000000000000000000000000000000000000..321a18a516d88749bde3e5cf2677fda941f12480 --- /dev/null +++ b/mace/port/posix/BUILD.bazel @@ -0,0 +1,19 @@ +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # Apache 2.0 + +cc_library( + name = "port_posix", + srcs = glob([ + "*.cc", + ]), + hdrs = glob([ + "*.h", + ]), + deps = [ + "//mace/port:port_base", + "//mace/utils", + ], +) diff --git a/mace/port/posix/backtrace.h b/mace/port/posix/backtrace.h new file mode 100644 index 0000000000000000000000000000000000000000..d96419319f874b34149a25493ca44ecd22680976 --- /dev/null +++ b/mace/port/posix/backtrace.h @@ -0,0 +1,45 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_PORT_POSIX_BACKTRACE_H_ +#define MACE_PORT_POSIX_BACKTRACE_H_ + +#include + +#include +#include + +namespace mace { +namespace port { +namespace posix { + +inline std::vector GetBackTraceUnsafe(int max_steps) { + std::vector buffer(max_steps, 0); + int steps = backtrace(buffer.data(), max_steps); + + std::vector bt; + char **symbols = backtrace_symbols(buffer.data(), steps); + if (symbols != nullptr) { + for (int i = 0; i < steps; i++) { + bt.push_back(symbols[i]); + } + } + return bt; +} + +} // namespace posix +} // namespace port +} // namespace mace + +#endif // MACE_PORT_POSIX_BACKTRACE_H_ diff --git a/mace/port/posix/file_system.cc b/mace/port/posix/file_system.cc new file mode 100644 index 0000000000000000000000000000000000000000..a7873b9635e9754568df63ccd7a23491e5d49f30 --- /dev/null +++ b/mace/port/posix/file_system.cc @@ -0,0 +1,80 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/port/posix/file_system.h"
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+
+#include "mace/utils/memory.h"
+
+namespace mace {
+namespace port {
+
+namespace {
+class PosixReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
+ public:
+  PosixReadOnlyMemoryRegion() = delete;
+  PosixReadOnlyMemoryRegion(const void* addr, uint64_t length)
+    : addr_(addr), length_(length) {}
+  ~PosixReadOnlyMemoryRegion() override {
+    if (length_ > 0) {
+      munmap(const_cast<void *>(addr_), length_);
+    }
+  };
+  const void *data() override { return addr_; };
+  uint64_t length() override { return length_; };
+
+ private:
+  const void *addr_;
+  const uint64_t length_;
+};
+} // namespace
+
+MaceStatus PosixFileSystem::NewReadOnlyMemoryRegionFromFile(
+    const char *fname,
+    std::unique_ptr<ReadOnlyMemoryRegion>* result) {
+  MaceStatus s = MaceStatus(MaceStatus::MACE_SUCCESS);
+  int fd = open(fname, O_RDONLY);
+  if (fd < 0) {
+    // TODO(heliangliang) check errno
+    s = MaceStatus(MaceStatus::MACE_RUNTIME_ERROR);
+  } else {
+    struct stat st;
+    fstat(fd, &st);
+    if (st.st_size > 0) {
+      const void* address =
+          mmap(nullptr, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+      if (address == MAP_FAILED) {
+        // TODO(heliangliang) check errno
+        s = MaceStatus(MaceStatus::MACE_RUNTIME_ERROR);
+      } else {
+        *result = make_unique<PosixReadOnlyMemoryRegion>(address, st.st_size);
+      }
+    } else {
+      // Empty file: mmap returns EINVAL (since Linux 2.6.12) when length is 0
+      *result = make_unique<PosixReadOnlyMemoryRegion>(nullptr, 0);
+    }
+    // Close the descriptor on both branches; the mapping stays valid.
+    close(fd);
+  }
+  return s;
+}
+
+} // namespace port
+} // namespace mace
diff --git a/mace/port/posix/file_system.h b/mace/port/posix/file_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..8eb370757fcce9a558b993ac7f80e2d0ca1d2024
--- /dev/null
+++ b/mace/port/posix/file_system.h
@@ -0,0 +1,37 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#ifndef MACE_PORT_POSIX_FILE_SYSTEM_H_ +#define MACE_PORT_POSIX_FILE_SYSTEM_H_ + +#include +#include + +#include "mace/port/file_system.h" + +namespace mace { +namespace port { + +class PosixFileSystem : public FileSystem { + public: + PosixFileSystem() = default; + ~PosixFileSystem() override = default; + MaceStatus NewReadOnlyMemoryRegionFromFile(const char *fname, + std::unique_ptr* result) override; +}; + +} // namespace port +} // namespace mace + +#endif // MACE_PORT_POSIX_FILE_SYSTEM_H_ diff --git a/mace/utils/env_time.h b/mace/port/posix/time.h similarity index 72% rename from mace/utils/env_time.h rename to mace/port/posix/time.h index 18d6e5a6ad6229284a2ae2e3e2fbbeb50fc952d7..84ab478a9580ad67618d53517a8f87afc4f2699b 100644 --- a/mace/utils/env_time.h +++ b/mace/port/posix/time.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,28 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_UTILS_ENV_TIME_H_ -#define MACE_UTILS_ENV_TIME_H_ +#ifndef MACE_PORT_POSIX_TIME_H_ +#define MACE_PORT_POSIX_TIME_H_ -#include -#ifdef __hexagon__ -#include -#else #include -#endif + +#include namespace mace { +namespace port { +namespace posix { inline int64_t NowMicros() { -#ifdef __hexagon__ - return HAP_perf_get_time_us(); -#else struct timeval tv; gettimeofday(&tv, nullptr); return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; -#endif } +} // namespace posix +} // namespace port } // namespace mace -#endif // MACE_UTILS_ENV_TIME_H_ +#endif // MACE_PORT_POSIX_TIME_H_ diff --git a/mace/proto/BUILD b/mace/proto/BUILD.bazel similarity index 100% rename from mace/proto/BUILD rename to mace/proto/BUILD.bazel diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index 530de3aedfcd6a94d9ee840f8e368a4447d6cd8c..d3b564fc6a9de2b7b79f9c73df53b3fa9e310788 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -86,21 +86,15 @@ message OperatorDef { } // for hexagon mace-nnlib -message InputInfo { - optional string name = 1; - optional int32 node_id = 2; - repeated int32 dims = 3; - optional int32 max_byte_size = 4; // only support 32-bit len - optional DataType data_type = 5 [default = DT_FLOAT]; - optional int32 data_format = 6 [default = 1]; // NHWC -} -message OutputInfo { +message InputOutputInfo { optional string name = 1; optional int32 node_id = 2; repeated int32 dims = 3; optional int32 max_byte_size = 4; // only support 32-bit len optional DataType data_type = 5 [default = DT_FLOAT]; optional int32 data_format = 6 [default = 1]; // NHWC + optional float scale = 7; + optional int32 zero_point = 8; } message NetDef { @@ -109,6 +103,6 @@ message NetDef { repeated ConstTensor tensors = 3; // for hexagon mace-nnlib - repeated InputInfo input_info = 100; - repeated OutputInfo output_info = 101; + repeated InputOutputInfo input_info = 100; + repeated InputOutputInfo output_info = 101; } diff --git a/mace/public/BUILD b/mace/public/BUILD.bazel similarity index 87% rename from mace/public/BUILD rename to mace/public/BUILD.bazel index b434312bcfdd4ec65a78bfc879a2dfcb41cc129c..158bc564dff7c4118ff368d0dfd1cb6a0eb0547f 100644 --- a/mace/public/BUILD +++ b/mace/public/BUILD.bazel @@ -12,5 +12,8 @@ cc_library( hdrs = [ "mace.h", ], + srcs = [ + "status.cc", + ], copts = ["-Werror", "-Wextra", 
"-Wno-missing-field-initializers"], ) diff --git a/mace/public/mace.h b/mace/public/mace.h index 575ca32877374badf249a3b7bcad89f2e740793e..c265401ed3ca3f0eb88a51ed03ab206aa2c7c2b3 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -32,9 +32,12 @@ namespace mace { class NetDef; -enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 }; +enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, HTA = 4 }; -enum DataFormat { DF_NONE = 0, NHWC = 1, NCHW = 2}; +enum DataFormat { + DF_NONE = 0, NHWC = 1, NCHW = 2, + HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 +}; enum GPUPerfHint { PERF_DEFAULT = 0, @@ -102,7 +105,7 @@ class RunMetadata { /// Consistent with Android NNAPI struct PerformanceInfo { - // Time of executing some workload. + // Time of executing some workload(millisecond). // negative value for unsupported. float exec_time; }; @@ -144,7 +147,9 @@ class MaceStatus { enum Code { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1, - MACE_OUT_OF_RESOURCES = 2 + MACE_OUT_OF_RESOURCES = 2, + MACE_UNSUPPORTED = 3, + MACE_RUNTIME_ERROR = 4, }; public: @@ -167,18 +172,6 @@ class MaceStatus { std::unique_ptr impl_; }; - -#define MACE_RETURN_IF_ERROR(stmt) \ - { \ - MaceStatus status = (stmt); \ - if (status != MaceStatus::MACE_SUCCESS) { \ - VLOG(0) << "Mace runtime failure: " \ - << __FILE__ << ":" << __LINE__ << ". " \ - << status.information(); \ - return status; \ - } \ - } - /// \brief GPU context contain the status used for GPU device. /// /// There are some data in common between different MaceEngines using GPU, diff --git a/mace/core/status.cc b/mace/public/status.cc similarity index 86% rename from mace/core/status.cc rename to mace/public/status.cc index 12134f88a73940e26c8eb6c70a65011dcb25d647..c377c9b64112750bd9e46f53bdccf664b1aa8ca3 100644 --- a/mace/core/status.cc +++ b/mace/public/status.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include + #include "mace/public/mace.h" namespace mace { @@ -26,10 +28,16 @@ class MaceStatus::Impl { void SetCode(const Code code) { code_ = code; } Code code() const { return code_; } void SetInformation(const std::string &info) { information_ = info; } - std::string information() const { return Code2Str() + ": " + information_; } + std::string information() const { + if (information_.empty()) { + return CodeToString(); + } else { + return CodeToString() + ": " + information_; + } + } private: - std::string Code2Str() const { + std::string CodeToString() const { switch (code_) { case MaceStatus::MACE_SUCCESS: return "Success"; @@ -37,8 +45,14 @@ class MaceStatus::Impl { return "Invalid Arguments"; case MaceStatus::MACE_OUT_OF_RESOURCES: return "Out of resources"; + case MACE_UNSUPPORTED: + return "Unsupported"; + case MACE_RUNTIME_ERROR: + return "Runtime error"; default: - return ""; + std::ostringstream os; + os << code_; + return os.str(); } } diff --git a/mace/python/tools/BUILD b/mace/python/tools/BUILD.bazel similarity index 100% rename from mace/python/tools/BUILD rename to mace/python/tools/BUILD.bazel diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 56f2cceca6863672fa168209504187142ad83d05..0de68ce4f6af1c0ae6c995e77738015b998dafba 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -37,11 +37,14 @@ FLAGS = None device_type_map = {'cpu': cvt.DeviceType.CPU.value, 'gpu': cvt.DeviceType.GPU.value, 'dsp': cvt.DeviceType.HEXAGON.value, + 'hta': cvt.DeviceType.HTA.value, 'cpu+gpu': cvt.DeviceType.CPU.value} data_format_map = { 'NONE': cvt.DataFormat.DF_NONE, 'NHWC': cvt.DataFormat.NHWC, + 'NCHW': cvt.DataFormat.NCHW, + 'OIHW': cvt.DataFormat.OIHW, } @@ -52,10 +55,11 @@ def parse_data_type(data_type, device_type): return mace_pb2.DT_FLOAT else: return mace_pb2.DT_HALF - elif device_type == cvt.DeviceType.HEXAGON.value: + elif device_type == cvt.DeviceType.HEXAGON.value or \ + device_type == cvt.DeviceType.HTA.value: return mace_pb2.DT_FLOAT else: - print("Invalid device type: " + device_type) + print("Invalid device type: " + str(device_type)) def file_checksum(fname): @@ -66,12 +70,26 @@ def file_checksum(fname): return hash_func.hexdigest() +def split_shape(shape): + if shape.strip() == "": + return [] + else: + return shape.split(',') + + def parse_int_array_from_str(ints_str): - return [int(int_str) for int_str in ints_str.split(',')] + return [int(i) for i in split_shape(ints_str)] + + +def parse_float_array_from_str(floats_str): + return [float(i) for i in floats_str.split(',')] -def parse_float_array_from_str(ints_str): - return [float(int_str) for int_str in ints_str.split(',')] +def transpose_shape(shape, dst_order): + t_shape = [0] * len(shape) + for i in range(len(shape)): + t_shape[i] = shape[dst_order[i]] + return t_shape def main(unused_args): @@ -106,7 +124,7 @@ def main(unused_args): six.print_("platform %s is not supported." % FLAGS.platform, file=sys.stderr) sys.exit(-1) - if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', 'cpu+gpu']: + if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', 'hta', 'cpu+gpu']: six.print_("runtime %s is not supported." 
% FLAGS.runtime, file=sys.stderr) sys.exit(-1) @@ -139,6 +157,10 @@ def main(unused_args): else: input_node.data_format = data_format_map[input_node_formats[i]] input_node.shape = parse_int_array_from_str(input_node_shapes[i]) + if input_node.data_format == cvt.DataFormat.NCHW and\ + len(input_node.shape) == 4: + input_node.shape = transpose_shape(input_node.shape, [0, 2, 3, 1]) + input_node.data_format = cvt.DataFormat.NHWC if len(input_node_ranges) > i: input_node.range = parse_float_array_from_str(input_node_ranges[i]) option.add_input_node(input_node) @@ -156,6 +178,11 @@ def main(unused_args): else: output_node.data_format = data_format_map[output_node_formats[i]] output_node.shape = parse_int_array_from_str(output_node_shapes[i]) + if output_node.data_format == cvt.DataFormat.NCHW and\ + len(output_node.shape) == 4: + output_node.shape = transpose_shape(output_node.shape, + [0, 2, 3, 1]) + output_node.data_format = cvt.DataFormat.NHWC option.add_output_node(output_node) if FLAGS.check_node != '': @@ -196,7 +223,8 @@ def main(unused_args): option, output_graph_def) output_graph_def, quantize_activation_info = mace_transformer.run() - if FLAGS.runtime == 'dsp': + if option.device in [cvt.DeviceType.HEXAGON.value, + cvt.DeviceType.HTA.value]: from mace.python.tools.converter_tool import hexagon_converter converter = hexagon_converter.HexagonConverter( option, output_graph_def, quantize_activation_info) diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 409c3b321b7b9b9a1b8cc1614647468f5e5c0efc..7fc877d662a90bc4d6030daab3843b27cb801f80 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -22,15 +22,13 @@ class DeviceType(Enum): CPU = 0 GPU = 2 HEXAGON = 3 + HTA = 4 class DataFormat(Enum): DF_NONE = 0 NHWC = 1 NCHW = 2 - - -class FilterFormat(Enum): HWIO = 100 OIHW = 101 HWOI = 102 @@ -104,6 +102,7 @@ class FrameworkType(Enum): MaceSupportedOps = [ 'Activation', 'AddN', + 'Affine', 'ArgMax', 'BatchNorm', 'BatchToSpaceND', @@ -127,9 +126,11 @@ MaceSupportedOps = [ 'InferConv2dShape', 'LocalResponseNorm', 'LSTMCell', + # 'LstmNonlinear', 'MatMul', 'OneHot', 'Pad', + 'PNorm', 'Pooling', 'PriorBox', 'Proposal', @@ -141,6 +142,8 @@ MaceSupportedOps = [ 'ResizeNearestNeighbor', 'Reverse', 'ScalarMath', + 'Slice', + 'Splice', 'Split', 'Shape', 'Squeeze', @@ -151,9 +154,13 @@ MaceSupportedOps = [ 'SpaceToBatchND', 'SpaceToDepth', 'SqrDiffMean', + 'SumGroup', + 'TargetRMSNorm', + 'TimeOffset', 'Transpose', 'WinogradInverseTransform', 'WinogradTransform', + 'Cumsum', ] MaceOp = Enum('MaceOp', [(op, op) for op in MaceSupportedOps], type=str) @@ -166,6 +173,7 @@ class MaceKeyword(object): mace_buffer_type = 'buffer_type' # arg related str mace_padding_str = 'padding' + mace_padding_type_str = 'padding' mace_padding_values_str = 'padding_values' mace_strides_str = 'strides' mace_dilations_str = 'dilations' @@ -173,6 +181,7 @@ class MaceKeyword(object): mace_global_pooling_str = 'global_pooling' mace_kernel_str = 'kernels' mace_data_format_str = 'data_format' + mace_has_data_format_str = 'has_data_format' mace_filter_format_str = 'filter_format' mace_element_type_str = 'type' mace_activation_type_str = 'activation' @@ -228,7 +237,10 @@ class MaceKeyword(object): mace_step_h_str = 'step_h' mace_step_w_str = 'step_w' mace_find_range_every_time = 'find_range_every_time' + mace_non_zero = 'non_zero' mace_pad_type_str = 'pad_type' + mace_exclusive_str = 'exclusive' + 
mace_reverse_str = 'reverse' class TransformerRule(Enum): @@ -271,6 +283,7 @@ class TransformerRule(Enum): FOLD_FC_RESHAPE = 37 TRANSFORM_CHANNEL_SHUFFLE = 38 UPDATE_DATA_FORMAT = 39 + QUANTIZE_SPECIFIC_OPS_ONLY = 40 class ConverterInterface(object): @@ -481,6 +494,7 @@ class ConverterOption(object): # Model data format related transformation TransformerRule.TRANSPOSE_FILTERS, TransformerRule.TRANSPOSE_DATA_FORMAT, + TransformerRule.TRANSPOSE_MATMUL_WEIGHT, # Add winograd argument TransformerRule.ADD_WINOGRAD_ARG, # Mace model structure related transformation @@ -514,6 +528,16 @@ class ConverterUtil(object): return arg return None + @staticmethod + def del_arg(op, arg_name): + found_idx = -1 + for idx in range(len(op.arg)): + if op.arg[idx].name == arg_name: + found_idx = idx + break + if found_idx != -1: + del op.arg[found_idx] + @staticmethod def add_data_format_arg(op, data_format): data_format_arg = op.arg.add() @@ -549,11 +573,11 @@ class ConverterUtil(object): arg = ConverterUtil.get_arg(net, MaceKeyword.mace_filter_format_str) if arg is None: return None - elif arg.i == FilterFormat.HWIO.value: - return FilterFormat.HWIO - elif arg.i == FilterFormat.HWOI.value: - return FilterFormat.HWOI - elif arg.i == FilterFormat.OIHW.value: - return FilterFormat.OIHW + elif arg.i == DataFormat.HWIO.value: + return DataFormat.HWIO + elif arg.i == DataFormat.HWOI.value: + return DataFormat.HWOI + elif arg.i == DataFormat.OIHW.value: + return DataFormat.OIHW else: return None diff --git a/mace/python/tools/converter_tool/caffe_converter.py b/mace/python/tools/converter_tool/caffe_converter.py index 3231ea9fa58b9f6e43470250c2997f37b3ed87c3..c5b6176824d28dcf67a4dd68defdebdfecafcbed 100644 --- a/mace/python/tools/converter_tool/caffe_converter.py +++ b/mace/python/tools/converter_tool/caffe_converter.py @@ -27,7 +27,6 @@ from mace.python.tools.converter_tool.base_converter import ActivationType from mace.python.tools.converter_tool.base_converter import EltwiseType from mace.python.tools.converter_tool.base_converter import FrameworkType from mace.python.tools.converter_tool.base_converter import DataFormat -from mace.python.tools.converter_tool.base_converter import FilterFormat from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import ConverterUtil @@ -183,6 +182,7 @@ class CaffeConverter(base_converter.ConverterInterface): 'Slice': self.convert_slice, 'Softmax': self.convert_softmax, 'InnerProduct': self.convert_fully_connected, + 'Interp': self.convert_interp, 'BatchNorm': self.convert_folded_batchnorm, 'Crop': self.convert_crop, 'Scale': self.convert_scale, @@ -194,7 +194,7 @@ class CaffeConverter(base_converter.ConverterInterface): } self._option = option self._mace_net_def = mace_pb2.NetDef() - ConverterUtil.set_filter_format(self._mace_net_def, FilterFormat.OIHW) + ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW) self._caffe_net = CaffeNet() self._caffe_layers = caffe_pb2.NetParameter() caffe_weights = caffe_pb2.NetParameter() @@ -552,18 +552,20 @@ class CaffeConverter(base_converter.ConverterInterface): param = caffe_op.layer.crop_param op.type = MaceOp.Crop.name - axis_arg = op.arg.add() - axis_arg.name = MaceKeyword.mace_axis_str - axis_arg.i = 2 - if param.HasField('axis'): - axis_arg.i = param.axis - axis_arg.i = 4 + axis_arg.i if axis_arg.i < 0 else axis_arg.i + axis = param.axis + axis = 4 + axis if axis < 0 else axis + 
offset_value = -1 * np.ones(4, dtype=np.int32) + offset_len = len(param.offset) + if offset_len == 1: + while axis < 4: + offset_value[axis] = param.offset[0] + axis += 1 + else: + offset_value[axis:] = param.offset + offset_arg = op.arg.add() offset_arg.name = MaceKeyword.mace_offset_str - if len(param.offset) > 0: - offset_arg.ints.extend(list(param.offset)) - else: - offset_arg.i = 0 + offset_arg.ints.extend(offset_value) def convert_concat(self, caffe_op): op = self.convert_general_op(caffe_op) @@ -573,7 +575,7 @@ class CaffeConverter(base_converter.ConverterInterface): axis_arg = op.arg.add() axis_arg.name = MaceKeyword.mace_axis_str axis_arg.i = 1 - if param.HasField('axis'): + if param.HasField(MaceKeyword.mace_axis_str): axis_arg.i = param.axis elif param.HasField('concat_dim'): axis_arg.i = param.concat_dim @@ -593,6 +595,18 @@ class CaffeConverter(base_converter.ConverterInterface): axis_arg.name = MaceKeyword.mace_axis_str axis_arg.i = 1 + def convert_interp(self, caffe_op): + op = self.convert_general_op(caffe_op) + param = caffe_op.layer.interp_param + mace_check(param.HasField("height") and param.HasField("width"), + 'Only support bilinear interp with height and width') + op.type = MaceOp.ResizeBilinear.name + + size_arg = op.arg.add() + size_arg.name = MaceKeyword.mace_resize_size_str + size_value = np.array([param.height, param.width], dtype=np.int32) + size_arg.ints.extend(size_value) + def convert_fully_connected(self, caffe_op): op = self.convert_general_op(caffe_op) param = caffe_op.layer.inner_product_param diff --git a/mace/python/tools/converter_tool/hexagon_converter.py b/mace/python/tools/converter_tool/hexagon_converter.py index 60226ef887eca9f800ca650eff13feff5fbe11e6..53598243b247094ce43b5a832b65d1498c796547 100644 --- a/mace/python/tools/converter_tool/hexagon_converter.py +++ b/mace/python/tools/converter_tool/hexagon_converter.py @@ -20,6 +20,7 @@ from operator import mul from mace.proto import mace_pb2 from mace.python.tools.converter_tool import base_converter from mace.python.tools.converter_tool.base_converter import ConverterUtil +from mace.python.tools.converter_tool.base_converter import DeviceType from mace.python.tools.converter_tool.base_converter import EltwiseType from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import MaceOp @@ -29,11 +30,15 @@ from mace.python.tools.converter_tool.base_converter import ReduceType from mace.python.tools.convert_util import mace_check from mace.python.tools import graph_util +from six.moves import reduce + HexagonSupportedOps = [ 'BatchToSpaceND_8', 'DepthwiseSupernode_8x8p32to8', 'DequantizeOUTPUT_8tof', + 'INPUT', + 'OUTPUT', 'QuantizedAdd_8p8to8', 'QuantizedAvgPool_8', 'QuantizedConcat_8', @@ -126,9 +131,9 @@ class HexagonConverter(base_converter.ConverterInterface): self.add_input_output_node() if not self._option.check_nodes: - output_name = self._option.output_nodes.values()[0].name + output_name = list(self._option.output_nodes.values())[0].name else: - output_name = self._option.check_nodes.values()[0].name + output_name = list(self._option.check_nodes.values())[0].name output_name = normalize_name(output_name) self._model = graph_util.sort_mace_graph(self._model, output_name) @@ -330,7 +335,7 @@ class HexagonConverter(base_converter.ConverterInterface): else: op.type = self._hexagon_ops.map_nn_op(op.type) - def add_min_max(self, name, val): + def add_const_node(self, name, val): if name not in self._consts: tensor = 
self._model.tensors.add() self._consts[name] = tensor @@ -362,14 +367,14 @@ class HexagonConverter(base_converter.ConverterInterface): min_tensor_name = op + ':1' else: min_tensor_name = op + '_min:0' - self.add_min_max(min_tensor_name, minval) + self.add_const_node(min_tensor_name, minval) this_op.input.extend([min_tensor_name]) if add_max: if is_activation and diff_port: max_tensor_name = op + ':2' else: max_tensor_name = op + '_max:0' - self.add_min_max(max_tensor_name, maxval) + self.add_const_node(max_tensor_name, maxval) this_op.input.extend([max_tensor_name]) def add_shape_const_node(self, op, values, name): @@ -380,27 +385,48 @@ class HexagonConverter(base_converter.ConverterInterface): tensor.dims.extend(values) return tensor.name - def add_input_output_node(self): - for op in self._model.op: - if op.name.startswith(MaceKeyword.mace_input_node_name): - del op.input[0] - break + def add_constant_min_max_for_first_op(self, op): + minval = self._quantize_activation_info[op.input[0]].minval + maxval = self._quantize_activation_info[op.input[0]].maxval + input_op, _ = get_op_and_port_from_tensor(op.input[0]) + input_min = input_op + '_min:0' + input_max = input_op + '_max:0' + self.add_const_node(input_min, minval) + self.add_const_node(input_max, maxval) + for i in range(len(op.input)): + if op.input[i] == input_op + ':1': + op.input[i] = input_min + elif op.input[i] == input_op + ':2': + op.input[i] = input_max - output_node = None - if not self._option.check_nodes: - output_name = self._option.output_nodes.values()[0].name - else: - output_name = self._option.check_nodes.values()[0].name - output_name = normalize_name(output_name) - for op in self._model.op: - if op.name == output_name: - output_node = op - break - mace_check(output_node is not None, - "mace_output_node_* not found.") - del output_node.output_shape[:] - del output_node.output_type[:] - del output_node.out_max_byte_size[:] + def add_input_output_node(self): + mace_check( + self._model.op[0].type == HexagonOp.QuantizeINPUT_f_to_8.name, + "Not started with Quantize op.") + quantize_input_op = self._model.op[0] + del quantize_input_op.input[:] + + mace_check( + self._model.op[-1].type == HexagonOp.DequantizeOUTPUT_8tof.name, + "Not ended with Dequantize op.") + dequantize_output_op = self._model.op[-1] + del dequantize_output_op.output_shape[:] + del dequantize_output_op.output_type[:] + del dequantize_output_op.out_max_byte_size[:] + + if self._option.device == DeviceType.HTA.value: + # replace QuantizeINPUT_f_to_8 with INPUT + quantize_input_op.type = HexagonOp.INPUT.name + del quantize_input_op.output_shape[1:] + del quantize_input_op.output_type[1:] + del quantize_input_op.out_max_byte_size[1:] + + # replace first op's input min max with constant + self.add_constant_min_max_for_first_op(self._model.op[1]) + + # replace DequantizeOUTPUT_8tof with OUTPUT + dequantize_output_op.type = HexagonOp.OUTPUT.name + del dequantize_output_op.input[1:] def add_node_id(self): node_id_counter = 0 diff --git a/mace/python/tools/converter_tool/onnx_converter.py b/mace/python/tools/converter_tool/onnx_converter.py index 2f3570d59cc0a9ac5cd28ae9c43ab13974f51395..68f781a23dfc4fe5d09163b59422be15fec31f87 100644 --- a/mace/python/tools/converter_tool/onnx_converter.py +++ b/mace/python/tools/converter_tool/onnx_converter.py @@ -27,27 +27,28 @@ from mace.python.tools.converter_tool.base_converter import ReduceType from mace.python.tools.converter_tool.base_converter import FrameworkType from mace.python.tools.converter_tool.base_converter 
import RoundMode from mace.python.tools.converter_tool.base_converter import DataFormat -from mace.python.tools.converter_tool.base_converter import FilterFormat from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import ConverterUtil from mace.python.tools.convert_util import mace_check +import numpy as np + import onnx import onnx.utils -from onnx import helper, shape_inference, numpy_helper, optimizer -import numpy as np -from onnx import mapping -from onnx import TensorProto +from onnx import mapping, numpy_helper, TensorProto from numbers import Number +IS_PYTHON3 = sys.version_info > (3,) OnnxSupportedOps = [ 'Abs', # 'Acos', # 'Acosh', 'Add', + 'Affine', # 'And', + 'Append', 'ArgMax', 'ArgMin', # 'Asin', @@ -68,6 +69,7 @@ OnnxSupportedOps = [ # 'Cos', # 'Cosh', 'DepthToSpace', + 'DimRange', 'Div', 'Dropout', 'Elu', @@ -88,10 +90,12 @@ OnnxSupportedOps = [ # 'Hardmax', 'Identity', # 'If', + 'IfDefined', 'ImageScaler', # 'InstanceNormalization', # 'LRN', - # 'LSTM', + 'LSTM', + # 'LstmNonlinear', 'LeakyRelu', # 'Less', # 'Log', @@ -109,11 +113,15 @@ OnnxSupportedOps = [ 'Mul', # 'Multinomial', 'Neg', + 'Normalize', # 'Not', + 'Offset', # 'OneHot', # 'Or', 'PRelu', - 'Pad', + # 'Pad', + 'Padding', + 'PNorm', 'Pow', # 'RNN', # 'RandomNormal', @@ -133,6 +141,7 @@ OnnxSupportedOps = [ # 'ReduceSumSquare', 'Relu', 'Reshape', + 'Scale', # 'Scan', # 'Selu', 'Shape', @@ -140,18 +149,21 @@ OnnxSupportedOps = [ # 'Sin', # 'Sinh', # 'Size', - # 'Slice', + 'Slice', 'Softmax', # 'Softplus', # 'Softsign', 'SpaceToDepth', + 'Splice', 'Split', 'Sqrt', 'Squeeze', 'Sub', 'Sum', + 'SumGroup', # 'Tan', 'Tanh', + 'TargetRMSNorm', # 'Tile', # 'TopK', 'Transpose', @@ -188,7 +200,7 @@ def convert_onnx_attribute_proto(attr_proto): return attr_proto.i elif attr_proto.HasField('s'): return str(attr_proto.s, 'utf-8')\ - if sys.version_info.major == 3 else attr_proto.s + if IS_PYTHON3 else attr_proto.s elif attr_proto.HasField('t'): return attr_proto.t # this is a proto! 
elif attr_proto.floats: @@ -217,6 +229,8 @@ def onnx_dtype(dtype): class OnnxNode(object): def __init__(self, node): self.name = str(node.name) + if self.name == '': + self.name = str(node.output) self.op_type = str(node.op_type) self.domain = str(node.domain) self.attrs = dict([(attr.name, @@ -227,14 +241,14 @@ class OnnxNode(object): self.node_proto = node def print_info(self): - print "node: ", self.name - print " type: ", self.op_type - print " domain: ", self.domain - print " inputs: ", self.inputs - print " outputs: ", self.outputs - print " attrs:" + print("node: ", self.name) + print(" type: ", self.op_type) + print(" domain: ", self.domain) + print(" inputs: ", self.inputs) + print(" outputs: ", self.outputs) + print(" attrs:") for arg in self.attrs: - print " %s: %s" % (arg, self.attrs[arg]) + print(" %s: %s" % (arg, self.attrs[arg])) class OnnxTensor(object): @@ -273,6 +287,7 @@ class OnnxConverter(base_converter.ConverterInterface): OnnxOpType.Equal.name: EltwiseType.EQUAL, OnnxOpType.Sqrt.name: EltwiseType.POW, OnnxOpType.Reciprocal.name: EltwiseType.POW, + OnnxOpType.Scale.name: EltwiseType.PROD, } reduce_type = { @@ -296,6 +311,8 @@ class OnnxConverter(base_converter.ConverterInterface): self._op_converters = { OnnxOpType.Abs.name: self.convert_eltwise, OnnxOpType.Add.name: self.convert_eltwise, + OnnxOpType.Affine.name: self.convert_affine, + OnnxOpType.Append.name: self.convert_concat, OnnxOpType.ArgMax.name: self.convert_argmax, OnnxOpType.ArgMin.name: self.convert_argmax, OnnxOpType.AveragePool.name: self.convert_pooling, @@ -306,6 +323,7 @@ class OnnxConverter(base_converter.ConverterInterface): OnnxOpType.ConvTranspose.name: self.convert_deconv, OnnxOpType.DepthToSpace.name: self.convert_depth_space, OnnxOpType.Dropout.name: self.convert_identity, + OnnxOpType.DimRange.name: self.convert_dim_range, OnnxOpType.Div.name: self.convert_eltwise, OnnxOpType.Equal.name: self.convert_eltwise, OnnxOpType.Gather.name: self.convert_gather, @@ -313,53 +331,77 @@ class OnnxConverter(base_converter.ConverterInterface): OnnxOpType.GlobalAveragePool.name: self.convert_reduce, OnnxOpType.GlobalMaxPool.name: self.convert_reduce, OnnxOpType.Identity.name: self.convert_identity, + OnnxOpType.IfDefined.name: self.convert_identity, OnnxOpType.ImageScaler.name: self.convert_imagescaler, OnnxOpType.LeakyRelu.name: self.convert_activation, + # OnnxOpType.LogSoftmax.name: self.convert_softmax, + OnnxOpType.LSTM.name: self.convert_lstm, + # OnnxOpType.LstmNonlinear.name: self.convert_lstm_nonlinear, OnnxOpType.Max.name: self.convert_eltwise, OnnxOpType.MaxPool.name: self.convert_pooling, OnnxOpType.MatMul.name: self.convert_matmul, OnnxOpType.Min.name: self.convert_eltwise, OnnxOpType.Mul.name: self.convert_eltwise, OnnxOpType.Neg.name: self.convert_eltwise, - OnnxOpType.Pad.name: self.convert_pad, + OnnxOpType.Normalize: self.convert_normalize, + OnnxOpType.Offset.name: self.convert_timeoffset, + OnnxOpType.Padding.name: self.convert_identity, + OnnxOpType.PNorm.name: self.convert_pnorm, OnnxOpType.Pow.name: self.convert_eltwise, OnnxOpType.PRelu.name: self.convert_activation, OnnxOpType.Relu.name: self.convert_activation, OnnxOpType.Reshape.name: self.convert_reshape, OnnxOpType.Reciprocal.name: self.convert_eltwise, + OnnxOpType.Scale.name: self.convert_eltwise, OnnxOpType.Sigmoid.name: self.convert_activation, + OnnxOpType.Slice.name: self.convert_slice, OnnxOpType.Softmax.name: self.convert_softmax, OnnxOpType.SpaceToDepth.name: self.convert_depth_space, + OnnxOpType.Splice.name: 
self.convert_splice,
             OnnxOpType.Split.name: self.convert_split,
             OnnxOpType.Sqrt.name: self.convert_eltwise,
             OnnxOpType.Squeeze.name: self.convert_squeeze,
             OnnxOpType.Sub.name: self.convert_eltwise,
             OnnxOpType.Sum.name: self.convert_eltwise,
+            OnnxOpType.SumGroup.name: self.convert_sum_group,
             OnnxOpType.Tanh.name: self.convert_activation,
+            OnnxOpType.TargetRMSNorm: self.convert_target_rms_norm,
             OnnxOpType.Transpose.name: self.convert_transpose,
         }
         self._option = option
         self._mace_net_def = mace_pb2.NetDef()
-        ConverterUtil.set_filter_format(self._mace_net_def, FilterFormat.OIHW)
+        self._data_format = DataFormat.NCHW
+        ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW)
         onnx_model = onnx.load(src_model_file)
 
-        polished_model = onnx.utils.polish_model(onnx_model)
-
-        print "onnx model IR version: ", onnx_model.ir_version
-        print "onnx model opset import: ", onnx_model.opset_import
-
-        self._onnx_model = shape_inference.infer_shapes(polished_model)
+        ir_version = onnx_model.ir_version
+        opset_imp = onnx_model.opset_import
+
+        polish_available = True
+        print("onnx model IR version: ", ir_version)
+        for imp in opset_imp:
+            domain = imp.domain
+            version = imp.version
+            print("contains ops domain: ", domain, "version:", version)
+            if 'kaldi2onnx' in domain:
+                polish_available = False
+                self._data_format = DataFormat.DF_NONE
+        if polish_available:
+            onnx_model = onnx.utils.polish_model(onnx_model)
+
+        self._onnx_model = onnx_model
         self._graph_shapes_dict = {}
         self._consts = {}
         self._replace_tensors = {}
 
-    def print_graph_info(self, graph):
+    @staticmethod
+    def print_graph_info(graph):
         for value_info in graph.value_info:
-            print "value info:", value_info
+            print("value info:", value_info)
         for value_info in graph.input:
-            print "inputs info:", value_info
+            print("inputs info:", value_info)
         for value_info in graph.output:
-            print "outputs info:", value_info
+            print("outputs info:", value_info)
 
     def extract_shape_info(self, graph):
         def extract_value_info(shape_dict, value_info):
@@ -368,12 +410,12 @@ class OnnxConverter(base_converter.ConverterInterface):
             if t:
                 shape_dict[value_info.name] = t
 
-        for value_info in graph.value_info:
-            extract_value_info(self._graph_shapes_dict, value_info)
-        for value_info in graph.input:
-            extract_value_info(self._graph_shapes_dict, value_info)
-        for value_info in graph.output:
-            extract_value_info(self._graph_shapes_dict, value_info)
+        for vi in graph.value_info:
+            extract_value_info(self._graph_shapes_dict, vi)
+        for vi in graph.input:
+            extract_value_info(self._graph_shapes_dict, vi)
+        for vi in graph.output:
+            extract_value_info(self._graph_shapes_dict, vi)
 
     def add_tensor(self, name, shape, data_type, value):
         tensor = self._mace_net_def.tensors.add()
@@ -387,11 +429,6 @@ class OnnxConverter(base_converter.ConverterInterface):
         self.extract_shape_info(graph_def)
         self.convert_tensors(graph_def)
         self.convert_ops(graph_def)
-        # self.print_graph_info(graph_def)
-        # shape_inferer = mace_shape_inference.ShapeInference(
-        #     self._mace_net_def,
-        #     self._option.input_nodes.values())
-        # shape_inferer.run()
         return self._mace_net_def
 
     def add_stride_pad_kernel_arg(self, attrs, op_def):
@@ -435,6 +472,32 @@ class OnnxConverter(base_converter.ConverterInterface):
             padding_arg.name = MaceKeyword.mace_padding_values_str
             padding_arg.ints.extend(pad)
 
+    def remove_node(self, node):
+        input_name = node.inputs[0]
+        output_name = node.outputs[0]
+        self._replace_tensors[output_name] = input_name
+
+    @staticmethod
+    def squeeze_shape(shape, axis):
+        new_shape = []
+        if
len(axis) > 0: + for i in range(len(shape)): + if i not in axis: + new_shape.append(shape[i]) + else: + new_shape = shape + return new_shape + + @staticmethod + def transpose_const(tensor): + shape = tensor.dims + mace_check(len(shape) == 2, "gemm only supports 2-dim input.") + tensor_data = np.array(tensor.float_data).reshape( + shape[0], shape[1]) + tensor_data = tensor_data.transpose(1, 0) + tensor.float_data[:] = tensor_data.flat + tensor.dims[:] = tensor_data.shape + def convert_ops(self, graph_def): for n in graph_def.node: node = OnnxNode(n) @@ -471,7 +534,7 @@ class OnnxConverter(base_converter.ConverterInterface): "Not supported tensor type: %s" % data_type) self._consts[tensor.name] = tensor - def convert_general_op(self, node): + def convert_general_op(self, node, with_shape=True): op = self._mace_net_def.op.add() op.name = node.name @@ -481,9 +544,11 @@ class OnnxConverter(base_converter.ConverterInterface): op.input.append(input) for output in node.outputs: op.output.append(output) - output_shape = op.output_shape.add() - shape_info = self._graph_shapes_dict[output] - output_shape.dims.extend(shape_info) + if with_shape: + if output in self._graph_shapes_dict: + output_shape = op.output_shape.add() + shape_info = self._graph_shapes_dict[output] + output_shape.dims.extend(shape_info) data_type_arg = op.arg.add() data_type_arg.name = 'T' @@ -493,91 +558,9 @@ class OnnxConverter(base_converter.ConverterInterface): framework_type_arg.name = MaceKeyword.mace_framework_type_str framework_type_arg.i = FrameworkType.ONNX.value - ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) + ConverterUtil.add_data_format_arg(op, self._data_format) return op - def convert_fused_batchnorm(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.BatchNorm.name - - if "epsilon" in node.attrs: - epsilon_value = node.attrs["epsilon"] - else: - epsilon_value = 1e-5 - - mace_check(len(node.inputs) == 5, "batch norm should have 5 inputs.") - - gamma_value = np.array(self._consts[node.inputs[1]].float_data) - beta_value = np.array(self._consts[node.inputs[2]].float_data) - mean_value = np.array(self._consts[node.inputs[3]].float_data) - var_value = np.array(self._consts[node.inputs[4]].float_data) - - scale_name = node.name + 'scale' - offset_name = node.name + 'offset' - scale_value = ( - (1.0 / np.sqrt( - var_value + epsilon_value)) * gamma_value) - offset_value = (-mean_value * scale_value) + beta_value - self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT, - scale_value) - self.add_tensor(offset_name, offset_value.shape, mace_pb2.DT_FLOAT, - offset_value) - del op.input[1:] - op.input.extend([scale_name, offset_name]) - del op.output[1:] - del op.output_shape[1:] - - def convert_conv2d(self, node): - op = self.convert_general_op(node) - self.add_stride_pad_kernel_arg(node.attrs, op) - group_arg = op.arg.add() - group_arg.name = MaceKeyword.mace_group_str - if 'group' in node.attrs: - group_val = node.attrs["group"] - else: - group_val = 1 - group_arg.i = group_val - - is_depthwise = False - if group_val > 1: - filter_shape = self._graph_shapes_dict[node.inputs[1]] - mace_check(group_val == filter_shape[0] and - filter_shape[1] == 1, - "Mace does not support group convolution yet") - filter_tensor = self._consts[node.inputs[1]] - new_shape = [filter_shape[1], filter_shape[0], - filter_shape[2], filter_shape[3]] - del filter_tensor.dims[:] - filter_tensor.dims.extend(new_shape) - is_depthwise = True - if is_depthwise: - op.type = MaceOp.DepthwiseConv2d.name - else: - op.type 
= MaceOp.Conv2D.name - - dilation_arg = op.arg.add() - dilation_arg.name = MaceKeyword.mace_dilations_str - if 'dilations' in node.attrs: - dilation_val = node.attrs["dilations"] - else: - dilation_val = [1, 1] - dilation_arg.ints.extend(dilation_val) - - def convert_biasadd(self, node): - self.convert_general_op(node) - - def convert_concat(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.Concat.name - mace_check('axis' in node.attrs, - 'Concat op should have axis attribute.') - axis_arg = op.arg.add() - axis_arg.name = MaceKeyword.mace_axis_str - axis_arg.i = node.attrs['axis'] - axis_arg.i = 4 + axis_arg.i if axis_arg.i < 0 else axis_arg.i - mace_check(axis_arg.i == 1, - "only support concat at channel dimension") - def convert_activation(self, node): op = self.convert_general_op(node) op.type = MaceOp.Activation.name @@ -597,100 +580,12 @@ class OnnxConverter(base_converter.ConverterInterface): alpha_arg.name = MaceKeyword.mace_activation_max_limit_str alpha_arg.f = alpha_value - def convert_pooling(self, node): - op = self.convert_general_op(node) - - op.type = MaceOp.Pooling.name - self.add_stride_pad_kernel_arg(node.attrs, op) - pooling_type_arg = op.arg.add() - pooling_type_arg.name = MaceKeyword.mace_pooling_type_str - pooling_type_arg.i = self.pooling_type_mode[node.op_type].value - - round_mode_arg = op.arg.add() - round_mode_arg.name = MaceKeyword.mace_round_mode_str - round_mode_arg.i = RoundMode.FLOOR.value - - def convert_reshape(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.Reshape.name - - def convert_flatten(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.Reshape.name - - def remove_node(self, node): - input_name = node.inputs[0] - output_name = node.outputs[0] - self._replace_tensors[output_name] = input_name - - def convert_eltwise(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.Eltwise.name - type_arg = op.arg.add() - type_arg.name = MaceKeyword.mace_element_type_str - type_arg.i = self.eltwise_type[node.op_type].value - - if node.op_type == OnnxOpType.Sqrt.name: - value_arg = op.arg.add() - value_arg.name = MaceKeyword.mace_scalar_input_str - value_arg.f = 0.5 - elif node.op_type == OnnxOpType.Reciprocal.name: - value_arg = op.arg.add() - value_arg.name = MaceKeyword.mace_scalar_input_str - value_arg.f = -1 - - def convert_reduce(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.Reduce.name - - reduce_type_arg = op.arg.add() - reduce_type_arg.name = MaceKeyword.mace_reduce_type_str - reduce_type_arg.i = self.reduce_type[node.op_type].value - - if node.op_type in [OnnxOpType.GlobalAveragePool.name, - OnnxOpType.GlobalMaxPool.name]: - reduce_dims = [2, 3] - keep_dims = 1 - else: - if 'axes' in node.attrs: - reduce_dims = node.attrs['axes'] - else: - reduce_dims = [] - if 'keepdims' in node.attrs: - keep_dims = node.attrs['keepdims'] - else: - keep_dims = 1 - axis_arg = op.arg.add() - axis_arg.name = MaceKeyword.mace_axis_str - axis_arg.ints.extend(reduce_dims) - - keep_dims_arg = op.arg.add() - keep_dims_arg.name = MaceKeyword.mace_keepdims_str - keep_dims_arg.i = keep_dims - - def convert_imagescaler(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.BatchNorm.name - - scale = node.attrs['scale'] - bias_value = np.array(node.attrs['bias']) - scale_value = scale * np.ones_like(bias_value) - - scale_name = node.name + "_scale" - bias_name = node.name + "_bias" - self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT, - scale_value) - 
self.add_tensor(bias_name, bias_value.shape, mace_pb2.DT_FLOAT,
-                        bias_value)
-        op.input.extend([scale_name, bias_name])
-
-    def convert_matmul(self, node):
+    def convert_affine(self, node):
         op = self.convert_general_op(node)
         op.type = MaceOp.MatMul.name
-
-    def convert_softmax(self, node):
-        op = self.convert_general_op(node)
-        op.type = MaceOp.Softmax.name
+        transpose_b_arg = op.arg.add()
+        transpose_b_arg.name = MaceKeyword.mace_transpose_b_str
+        transpose_b_arg.i = 1
 
     def convert_argmax(self, node):
         op = self.convert_general_op(node)
@@ -717,6 +612,10 @@ class OnnxConverter(base_converter.ConverterInterface):
             min_arg.name = MaceKeyword.mace_argmin_str
             min_arg.i = 1
 
+    def convert_biasadd(self, node):
+        op = self.convert_general_op(node)
+        op.type = MaceOp.BiasAdd.name
+
     def convert_cast(self, node):
         op = self.convert_general_op(node)
         op.type = MaceOp.Cast.name
@@ -732,41 +631,51 @@ class OnnxConverter(base_converter.ConverterInterface):
         else:
             op.output_type.extend([self._option.data_type])
 
-    def convert_depth_space(self, node):
+    def convert_concat(self, node):
         op = self.convert_general_op(node)
-        if op.type == OnnxOpType.DepthToSpace.name:
-            op.type = MaceOp.DepthToSpace.name
-        else:
-            op.type = MaceOp.SpaceToDepth.name
-        mace_check(('block_size' in node.attrs),
-                   "depth to space op should have block size attribute.")
-        block_size = node.attrs['block_size']
-        size_arg = op.arg.add()
-        size_arg.name = MaceKeyword.mace_space_depth_block_size_str
-        size_arg.i = block_size
+        op.type = MaceOp.Concat.name
+        axis_value = 1
+        if node.op_type == OnnxOpType.Concat.name:
+            mace_check('axis' in node.attrs,
+                       'Concat op should have axis attribute.')
+            axis_value = node.attrs['axis']
+            mace_check(axis_value == 1 or axis_value == -3,
+                       "only support concat at channel dimension")
+        elif node.op_type == OnnxOpType.Append.name:
+            axis_value = 2
+        axis_arg = op.arg.add()
+        axis_arg.name = MaceKeyword.mace_axis_str
+        axis_arg.i = 4 + axis_value if axis_value < 0 else axis_value
 
-    def convert_deconv(self, node):
+    def convert_conv2d(self, node):
         op = self.convert_general_op(node)
-        self.add_stride_pad_kernel_arg(node.attrs, op)
-
+        group_arg = op.arg.add()
+        group_arg.name = MaceKeyword.mace_group_str
         if 'group' in node.attrs:
             group_val = node.attrs["group"]
         else:
             group_val = 1
+        group_arg.i = group_val
+
+        is_depthwise = False
         if group_val > 1:
-            op.type = MaceOp.DepthwiseDeconv2d.name
             filter_shape = self._graph_shapes_dict[node.inputs[1]]
+            mace_check(group_val == filter_shape[0] and
+                       filter_shape[1] == 1,
+                       "Mace does not support group convolution yet")
             filter_tensor = self._consts[node.inputs[1]]
             new_shape = [filter_shape[1], filter_shape[0],
                          filter_shape[2], filter_shape[3]]
             del filter_tensor.dims[:]
             filter_tensor.dims.extend(new_shape)
+            is_depthwise = True
+        if is_depthwise:
+            op.type = MaceOp.DepthwiseConv2d.name
         else:
-            op.type = MaceOp.Deconv2D.name
-            group_arg = op.arg.add()
-            group_arg.name = MaceKeyword.mace_group_str
-            group_arg.i = group_val
+            op.type = MaceOp.Conv2D.name
+        mace_check(op.input[1] in self._consts,
+                   "Mace does not support non-const filter convolution.")
 
         dilation_arg = op.arg.add()
         dilation_arg.name = MaceKeyword.mace_dilations_str
@@ -775,16 +684,47 @@ class OnnxConverter(base_converter.ConverterInterface):
         else:
             dilation_val = [1, 1]
         dilation_arg.ints.extend(dilation_val)
-        mace_check(dilation_val == [1, 1],
-                   "not support convtranspose with dilation != 1 yet.")
-        mace_check('output_padding' not in node.attrs,
-                   "not support convtranspose with output_padding yet.")
-
mace_check('output_shape' not in node.attrs, - "not support convtranspose with output_shape yet.") - # TODO: if output shape specified, calculate padding value - # if 'output_padding' in node.attrs: - # output_padding = node.attrs['output_padding'] + def convert_deconv(self, node): + op = self.convert_general_op(node) + + self.add_stride_pad_kernel_arg(node.attrs, op) + + if 'group' in node.attrs: + group_val = node.attrs["group"] + else: + group_val = 1 + if group_val > 1: + op.type = MaceOp.DepthwiseDeconv2d.name + filter_shape = self._graph_shapes_dict[node.inputs[1]] + filter_tensor = self._consts[node.inputs[1]] + new_shape = [filter_shape[1], filter_shape[0], + filter_shape[2], filter_shape[3]] + del filter_tensor.dims[:] + filter_tensor.dims.extend(new_shape) + else: + op.type = MaceOp.Deconv2D.name + group_arg = op.arg.add() + group_arg.name = MaceKeyword.mace_group_str + group_arg.i = group_val + + dilation_arg = op.arg.add() + dilation_arg.name = MaceKeyword.mace_dilations_str + if 'dilations' in node.attrs: + dilation_val = node.attrs["dilations"] + else: + dilation_val = [1, 1] + dilation_arg.ints.extend(dilation_val) + mace_check(dilation_val == [1, 1], + "not support convtranspose with dilation != 1 yet.") + + mace_check('output_padding' not in node.attrs, + "not support convtranspose with output_padding yet.") + mace_check('output_shape' not in node.attrs, + "not support convtranspose with output_shape yet.") + # TODO: if output shape specified, calculate padding value + # if 'output_padding' in node.attrs: + # output_padding = node.attrs['output_padding'] # output_padding_arg = op.arg.add() # output_padding_arg.name = MaceKeyword.mace_output_padding_str # output_padding_arg.ints.extend(output_padding) @@ -794,43 +734,98 @@ class OnnxConverter(base_converter.ConverterInterface): # output_shape_arg.name = MaceKeyword.mace_output_shape_str # output_shape_arg.ints.extend(output_shape) - def convert_nop(self, node): - pass + def convert_depth_space(self, node): + op = self.convert_general_op(node) + if op.type == OnnxOpType.DepthToSpace.name: + op.type = MaceOp.DepthToSpace.name + else: + op.type = MaceOp.SpaceToDepth.name + mace_check(('block_size' in node.attrs), + "depth to space op should have block size attribute.") + block_size = node.attrs['block_size'] + size_arg = op.arg.add() + size_arg.name = MaceKeyword.mace_space_depth_block_size_str + size_arg.i = block_size - def convert_identity(self, node): + def convert_dim_range(self, node): op = self.convert_general_op(node) - op.type = MaceOp.Identity.name + op.type = MaceOp.Slice.name + + mace_check('offset' in node.attrs, + "Attribute dim required!") + mace_check('output_dim' in node.attrs, + "Attribute output_dim required!") + offset = node.attrs['offset'] + starts_arg = op.arg.add() + starts_arg.name = 'starts' + starts_arg.ints.append(offset) + output_dim = node.attrs['output_dim'] + ends_arg = op.arg.add() + ends_arg.name = 'output_dim' + ends_arg.ints.append(output_dim) + axes_arg = op.arg.add() + axes_arg.name = 'axes' + axes_arg.ints.append(-1) - def convert_pad(self, node): + def convert_eltwise(self, node): op = self.convert_general_op(node) - op.type = MaceOp.Pad.name + op.type = MaceOp.Eltwise.name + type_arg = op.arg.add() + type_arg.name = MaceKeyword.mace_element_type_str + type_arg.i = self.eltwise_type[node.op_type].value - if 'pads' in node.attrs: - paddings_arg = op.arg.add() - paddings_arg.name = MaceKeyword.mace_paddings_str - paddings_value = node.attrs['pads'] - 
paddings_arg.ints.extend(paddings_value)
+        if node.op_type == OnnxOpType.Sqrt.name:
+            value_arg = op.arg.add()
+            value_arg.name = MaceKeyword.mace_scalar_input_str
+            value_arg.f = 0.5
+        elif node.op_type == OnnxOpType.Reciprocal.name:
+            value_arg = op.arg.add()
+            value_arg.name = MaceKeyword.mace_scalar_input_str
+            value_arg.f = -1
+        elif node.op_type == OnnxOpType.Scale.name and 'scale' in node.attrs:
+            value = node.attrs['scale']
+            value_arg = op.arg.add()
+            value_arg.name = MaceKeyword.mace_scalar_input_str
+            value_arg.f = value
 
-        if 'value' in node.attrs:
-            constant_value_arg = op.arg.add()
-            constant_value_arg.name = MaceKeyword.mace_constant_value_str
-            constant_value_arg.i = node.attrs['value']
+    def convert_flatten(self, node):
+        op = self.convert_general_op(node)
+        op.type = MaceOp.Reshape.name
 
-    def convert_gather(self, node):
+    def convert_fused_batchnorm(self, node):
         op = self.convert_general_op(node)
-        op.type = MaceOp.Gather.name
+        op.type = MaceOp.BatchNorm.name
 
-        if 'axis' in node.attrs:
-            value = node.attrs['axis']
+        if "epsilon" in node.attrs:
+            epsilon_value = node.attrs["epsilon"]
         else:
-            value = 0
-        axis_arg = op.arg.add()
-        axis_arg.name = MaceKeyword.mace_axis_str
-        axis_arg.i = value
+            epsilon_value = 1e-5
 
-    def convert_split(self, node):
+        mace_check(len(node.inputs) == 5, "batch norm should have 5 inputs.")
+
+        gamma_value = np.array(self._consts[node.inputs[1]].float_data)
+        beta_value = np.array(self._consts[node.inputs[2]].float_data)
+        mean_value = np.array(self._consts[node.inputs[3]].float_data)
+        var_value = np.array(self._consts[node.inputs[4]].float_data)
+
+        scale_name = node.name + 'scale'
+        offset_name = node.name + 'offset'
+        scale_value = (
+                (1.0 / np.sqrt(
+                    var_value + epsilon_value)) * gamma_value)
+        offset_value = (-mean_value * scale_value) + beta_value
+        self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT,
+                        scale_value)
+        self.add_tensor(offset_name, offset_value.shape, mace_pb2.DT_FLOAT,
+                        offset_value)
+        del op.input[1:]
+        op.input.extend([scale_name, offset_name])
+        del op.output[1:]
+        del op.output_shape[1:]
+
+    def convert_gather(self, node):
         op = self.convert_general_op(node)
-        op.type = MaceOp.Split.name
+        op.type = MaceOp.Gather.name
 
         if 'axis' in node.attrs:
             value = node.attrs['axis']
@@ -840,64 +835,6 @@ class OnnxConverter(base_converter.ConverterInterface):
         axis_arg.name = MaceKeyword.mace_axis_str
         axis_arg.i = value
 
-    def convert_transpose(self, node):
-        op = self.convert_general_op(node)
-        op.type = MaceOp.Transpose.name
-
-        if 'perm' in node.attrs:
-            perm = node.attrs['perm']
-            ordered_perm = np.sort(perm)
-            if np.array_equal(perm, ordered_perm):
-                op.type = MaceOp.Identity.name
-                del op.input[1:]
-            else:
-                dims_arg = op.arg.add()
-                dims_arg.name = MaceKeyword.mace_dims_str
-                dims_arg.ints.extend(perm)
-
-    @staticmethod
-    def squeeze_shape(shape, axis):
-        new_shape = []
-        if len(axis) > 0:
-            for i in range(len(shape)):
-                if i not in axis:
-                    new_shape.append(shape[i])
-        else:
-            new_shape = shape
-        return new_shape
-
-    def convert_squeeze(self, node):
-        axis_value = node.attrs['axes']
-        if node.inputs[0] in self._consts:
-            tensor = self._consts[node.inputs[0]]
-            shape = tensor.dims
-            new_shape = self.squeeze_shape(shape, axis_value)
-            del tensor.dims[:]
-            tensor.dims.extend(new_shape)
-            self.remove_node(node)
-        else:
-            op = self.convert_general_op(node)
-            op.type = MaceOp.Squeeze.name
-            axis_arg = op.arg.add()
-            axis_arg.name =
MaceKeyword.mace_axis_str - if 'axis' in node.attrs: - axis_value = node.attrs['axis'] - else: - axis_value = [] - axis_arg.ints.extend(axis_value) - - @staticmethod - def transpose_const(tensor): - shape = tensor.dims - mace_check(len(shape) == 2, "gemm only supports 2-dim input.") - tensor_data = np.array(tensor.float_data).reshape( - shape[0], shape[1]) - tensor_data = tensor_data.transpose(1, 0) - tensor.float_data[:] = tensor_data.flat - tensor.dims[:] = tensor_data.shape - def convert_gemm(self, node): # only supports FullyConnected Style Gemm for now. trans_a = node.attrs['transA'] if 'transA' in node.attrs else 0 @@ -915,7 +852,7 @@ class OnnxConverter(base_converter.ConverterInterface): elif len(shape_b) == 2: tensor_b = self._consts[node.inputs[1]] tensor_data = np.array(tensor_b.float_data).reshape( - shape_b[0], shape_b[1], 1, 1) + shape_b[0], shape_b[1], 1, 1) tensor_b.float_data[:] = tensor_data.flat tensor_b.dims[:] = tensor_data.shape else: @@ -949,4 +886,224 @@ class OnnxConverter(base_converter.ConverterInterface): shape_info = [shape_info[0], shape_info[1], 1, 1] output_shape.dims.extend(shape_info) - return op + def convert_identity(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Identity.name + + def convert_imagescaler(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.BatchNorm.name + + scale = node.attrs['scale'] + bias_value = np.array(node.attrs['bias']) + scale_value = scale * np.ones_like(bias_value) + + scale_name = node.name + "_scale" + bias_name = node.name + "_bias" + self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT, + scale_value) + self.add_tensor(bias_name, bias_value.shape, mace_pb2.DT_FLOAT, + bias_value) + op.input.extend([scale_name, bias_name]) + + def convert_lstm(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.LSTMCell.name + + def convert_lstm_nonlinear(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.LstmNonlinear.name + + def convert_matmul(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.MatMul.name + + def convert_nop(self, node): + pass + + def convert_normalize(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.BatchNorm.name + + def convert_pnorm(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.PNorm.name + if 'output_dim' in node.attrs: + output_dim_arg = op.arg.add() + output_dim_arg.name = 'output_dim' + output_dim_arg.i = node.attrs['output_dim'] + if 'p' in node.attrs: + p_value = node.attrs['p'] + mace_check((p_value >= 0) and (p_value <= 2), + "PNorm only supports p = 0, 1, 2") + p_arg = op.arg.add() + p_arg.name = 'p' + p_arg.i = p_value + + def convert_pooling(self, node): + op = self.convert_general_op(node) + + op.type = MaceOp.Pooling.name + self.add_stride_pad_kernel_arg(node.attrs, op) + pooling_type_arg = op.arg.add() + pooling_type_arg.name = MaceKeyword.mace_pooling_type_str + pooling_type_arg.i = self.pooling_type_mode[node.op_type].value + + round_mode_arg = op.arg.add() + round_mode_arg.name = MaceKeyword.mace_round_mode_str + round_mode_arg.i = RoundMode.FLOOR.value + + def convert_reduce(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Reduce.name + + reduce_type_arg = op.arg.add() + reduce_type_arg.name = MaceKeyword.mace_reduce_type_str + reduce_type_arg.i = self.reduce_type[node.op_type].value + + if node.op_type in [OnnxOpType.GlobalAveragePool.name, + OnnxOpType.GlobalMaxPool.name]: + reduce_dims = [2, 3] + keep_dims = 1 + else: + if 'axes' 
in node.attrs: + reduce_dims = node.attrs['axes'] + else: + reduce_dims = [] + if 'keepdims' in node.attrs: + keep_dims = node.attrs['keepdims'] + else: + keep_dims = 1 + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + axis_arg.ints.extend(reduce_dims) + + keep_dims_arg = op.arg.add() + keep_dims_arg.name = MaceKeyword.mace_keepdims_str + keep_dims_arg.i = keep_dims + + def convert_reshape(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Reshape.name + + def convert_slice(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Slice.name + + mace_check('starts' in node.attrs, "Attribute starts required!") + mace_check('ends' in node.attrs, "Attribute ends required!") + starts = node.attrs['starts'] + starts_arg = op.arg.add() + starts_arg.name = 'starts' + starts_arg.ints.extend(starts) + ends = node.attrs['ends'] + ends_arg = op.arg.add() + ends_arg.name = 'ends' + ends_arg.ints.extend(ends) + if 'axes' in node.attrs: + axes = node.attrs['axes'] + axes_arg = op.arg.add() + axes_arg.name = 'axes' + axes_arg.ints.extend(axes) + + def convert_softmax(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Softmax.name + # TODO: add logsoftmax in softmax op + # if node.op_type == OnnxOpType.LogSoftmax.name: + # use_log_arg = op.arg.add() + # use_log_arg.name = 'use_log' + # use_log_arg.i = 1 + + def convert_splice(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Splice.name + if 'context' in node.attrs: + context = node.attrs['context'] + else: + context = [0] + context_arg = op.arg.add() + context_arg.name = 'context' + context_arg.ints.extend(context) + if 'const_component_dim' in node.attrs: + const_dim = node.attrs['const_component_dim'] + else: + const_dim = 0 + const_dim_arg = op.arg.add() + const_dim_arg.name = 'const_component_dim' + const_dim_arg.i = const_dim + + def convert_split(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Split.name + + if 'axis' in node.attrs: + value = node.attrs['axis'] + else: + value = 0 + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + axis_arg.i = value + + def convert_squeeze(self, node): + axis_value = node.attrs['axes'] + if node.inputs[0] in self._consts: + tensor = self._consts[node.inputs[0]] + shape = tensor.dims + new_shape = self.squeeze_shape(shape, axis_value) + del tensor.dims[:] + tensor.dims.extend(new_shape) + self.remove_node(node) + else: + op = self.convert_general_op(node) + op.type = MaceOp.Squeeze.name + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + if 'axis' in node.attrs: + axis_value = node.attrs['axis'] + else: + axis_value = [] + axis_arg.ints.extend(axis_value) + + def convert_sum_group(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.SumGroup.name + + def convert_target_rms_norm(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.TargetRMSNorm.name + + if 'target_rms' in node.attrs: + value = node.attrs['target_rms'] + target_rms_arg = op.arg.add() + target_rms_arg.name = 'target_rms' + target_rms_arg.f = value + + def convert_transpose(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Transpose.name + + if 'perm' in node.attrs: + perm = node.attrs['perm'] + ordered_perm = np.sort(perm) + if np.array_equal(perm, ordered_perm): + op.type = MaceOp.Identity.name + del op.input[1:] + else: + dims_arg = op.arg.add() + dims_arg.name = MaceKeyword.mace_dims_str + dims_arg.ints.extend(perm) + + def convert_timeoffset(self, 
node): + op = self.convert_general_op(node) + mace_check('offset' in node.attrs, + 'Offset attribute required in Offset Node.') + offset = node.attrs['offset'] + if offset == 0: + op.type = MaceOp.Identity.name + else: + op.type = MaceOp.TimeOffset.name + + offset_arg = op.arg.add() + offset_arg.name = 'offset' + offset_arg.i = offset diff --git a/mace/python/tools/converter_tool/shape_inference.py b/mace/python/tools/converter_tool/shape_inference.py index 3e472216efa3651a663a32ee2db729497d059ff2..45254333915250c9366add9de94f626a3f6f5e65 100644 --- a/mace/python/tools/converter_tool/shape_inference.py +++ b/mace/python/tools/converter_tool/shape_inference.py @@ -20,7 +20,6 @@ import six from mace.python.tools.converter_tool.transformer import Transformer from mace.python.tools.converter_tool.base_converter import DataFormat -from mace.python.tools.converter_tool.base_converter import FilterFormat from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import ConverterUtil @@ -52,6 +51,7 @@ class ShapeInference(object): MaceOp.Transpose.name: self.infer_shape_permute, MaceOp.PriorBox.name: self.infer_shape_prior_box, MaceOp.Reshape.name: self.infer_shape_reshape, + MaceOp.ResizeBilinear.name: self.infer_shape_resize_bilinear, } self._net = net @@ -129,7 +129,7 @@ class ShapeInference(object): output_shape[0] = input_shape[0] if ConverterUtil.data_format(op) == DataFormat.NCHW \ - and ConverterUtil.filter_format(self._net) == FilterFormat.OIHW: # noqa + and ConverterUtil.filter_format(self._net) == DataFormat.OIHW: # noqa # filter format: OIHW if op.type == MaceOp.DepthwiseConv2d.name: output_shape[1] = filter_shape[0] * filter_shape[1] @@ -170,7 +170,7 @@ class ShapeInference(object): MaceKeyword.mace_group_str) output_shape[0] = input_shape[0] if ConverterUtil.data_format(op) == DataFormat.NCHW \ - and ConverterUtil.filter_format(self._net) == FilterFormat.OIHW: # noqa + and ConverterUtil.filter_format(self._net) == DataFormat.OIHW: # noqa # filter format: IOHW output_shape[1] = filter_shape[1] if group_arg is not None and group_arg.i > 1: @@ -224,7 +224,12 @@ class ShapeInference(object): def infer_shape_crop(self, op): mace_check(len(op.input) == 2, "crop layer needs two inputs") - output_shape = self._output_shape_cache[op.input[1]] + output_shape = self._output_shape_cache[op.input[0]] + input1_shape = self._output_shape_cache[op.input[1]] + offsets = ConverterUtil.get_arg(op, MaceKeyword.mace_offset_str).ints + for i in range(len(offsets)): + if offsets[i] >= 0: + output_shape[i] = input1_shape[i] self.add_output_shape(op, [output_shape]) def infer_shape_channel_shuffle(self, op): @@ -289,3 +294,17 @@ class ShapeInference(object): output_shape.append(self._output_shape_cache[op.input[0]][i]) output_shape[axis] = dim self.add_output_shape(op, [output_shape]) + + def infer_shape_resize_bilinear(self, op): + input_shape = self._output_shape_cache[op.input[0]] + size = ConverterUtil.get_arg( + op, MaceKeyword.mace_resize_size_str).ints + if ConverterUtil.data_format(op) == DataFormat.NCHW: + output_shape = [input_shape[0], input_shape[1], size[0], size[1]] + elif ConverterUtil.data_format(op) == DataFormat.NHWC: + output_shape = [input_shape[0], size[0], size[1], input_shape[3]] + else: + output_shape = [] + mace_check(False, "format %s is not supported" + % ConverterUtil.data_format(op)) + self.add_output_shape(op, [output_shape]) diff --git 
a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py index 3a7bf380e132c303edfa3a98c75f4fdab54d82e2..ec255e3a90296a04d8538c1ff464edb097fe5193 100644 --- a/mace/python/tools/converter_tool/tensorflow_converter.py +++ b/mace/python/tools/converter_tool/tensorflow_converter.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import os import math import numpy as np import six @@ -29,7 +29,6 @@ from mace.python.tools.converter_tool.base_converter import PadType from mace.python.tools.converter_tool.base_converter import FrameworkType from mace.python.tools.converter_tool.base_converter import ReduceType from mace.python.tools.converter_tool.base_converter import DataFormat -from mace.python.tools.converter_tool.base_converter import FilterFormat from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import ConverterUtil @@ -117,6 +116,7 @@ TFSupportedOps = [ 'FloorDiv', 'Sqrt', 'MirrorPad', + 'Cumsum', 'OneHot', ] @@ -124,39 +124,16 @@ TFOpType = Enum('TFOpType', [(op, op) for op in TFSupportedOps], type=str) TFSupportedOps = [six.b(op) for op in TFSupportedOps] -TFTransformGraphOptions = { - base_converter.DeviceType.CPU.value: [ - 'strip_unused_nodes', - 'remove_nodes(op=Identity, op=CheckNumerics)', - 'fold_constants(ignore_errors=true)', - 'fold_batch_norms', - 'fold_old_batch_norms', - 'remove_control_dependencies', - 'strip_unused_nodes', - 'sort_by_execution_order' - ], - base_converter.DeviceType.GPU.value: [ - 'strip_unused_nodes', - 'remove_nodes(op=Identity, op=CheckNumerics)', - 'fold_constants(ignore_errors=true)', - 'flatten_atrous_conv', - 'fold_batch_norms', - 'fold_old_batch_norms', - 'remove_control_dependencies', - 'strip_unused_nodes', - 'sort_by_execution_order' - ], - base_converter.DeviceType.HEXAGON.value: [ - 'strip_unused_nodes', - 'remove_nodes(op=Identity, op=CheckNumerics)', - 'fold_constants(ignore_errors=true)', - 'fold_batch_norms', - 'fold_old_batch_norms', - 'remove_control_dependencies', - 'strip_unused_nodes', - 'sort_by_execution_order' - ] -} +TFTransformGraphOptions = [ + 'strip_unused_nodes', + 'remove_nodes(op=Identity, op=CheckNumerics)', + 'fold_constants(ignore_errors=true)', + 'fold_batch_norms', + 'fold_old_batch_norms', + 'remove_control_dependencies', + 'strip_unused_nodes', + 'sort_by_execution_order' +] class TensorflowConverter(base_converter.ConverterInterface): @@ -278,11 +255,12 @@ class TensorflowConverter(base_converter.ConverterInterface): TFOpType.FloorDiv.name: self.convert_elementwise, TFOpType.Sqrt.name: self.convert_elementwise, TFOpType.MirrorPad.name: self.convert_pad, + TFOpType.Cumsum.name: self.convert_cumsum, TFOpType.OneHot.name: self.convert_one_hot, } self._option = option self._mace_net_def = mace_pb2.NetDef() - ConverterUtil.set_filter_format(self._mace_net_def, FilterFormat.HWIO) + ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.HWIO) # import tensorflow graph tf_graph_def = tf.GraphDef() @@ -290,29 +268,44 @@ class TensorflowConverter(base_converter.ConverterInterface): tf_graph_def.ParseFromString(f.read()) self._placeholders = {} - self.add_shape_info(tf_graph_def) + self._skip_tensor = set() + self._output_shape = {} - print("Run transform_graph: %s" % TFTransformGraphOptions[ - option.device]) + print("Run transform_graph: %s" % 
TFTransformGraphOptions) try: - print ("output keys: ", option.output_nodes.keys()) + print("output keys: ", option.output_nodes.keys()) transformed_graph_def = TransformGraph(tf_graph_def, option.input_nodes.keys(), option.output_nodes.keys(), - TFTransformGraphOptions[ - option.device]) + TFTransformGraphOptions) except Exception as ex: print("Failed to transform graph using tf tool: %s" % ex) transformed_graph_def = tf_graph_def + # To check optimized model, uncomment following code. + # tf.io.write_graph( + # transformed_graph_def, + # ".", + # os.path.basename(src_model_file)[:-3] + "_opt.pb", + # as_text=False + # ) + + self.add_shape_info(transformed_graph_def) + with tf.Session() as session: with session.graph.as_default() as graph: tf.import_graph_def(transformed_graph_def, name='') self._tf_graph = graph + self.update_output_shapes(session) - self._skip_tensor = set() - self._output_shape_list = [] - self._output_shape_op_list = [] + # we have polluted graph with 'shape' ops, so reset it and reload it + # again + tf.reset_default_graph() + + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(transformed_graph_def, name='') + self._tf_graph = graph def run(self): with tf.Session() as session: @@ -340,13 +333,19 @@ class TensorflowConverter(base_converter.ConverterInterface): for input_node in self._option.input_nodes.values(): if node.name == input_node.name \ or node.name + ':0' == input_node.name: + input_shape = input_node.shape + if input_node.data_format == DataFormat.OIHW \ + and len(input_shape) == 4: + # OIHW -> HWIO + input_shape = [input_shape[2], input_shape[3], + input_shape[1], input_shape[0]] del node.attr['shape'].shape.dim[:] node.attr['shape'].shape.dim.extend([ tensor_shape_pb2.TensorShapeProto.Dim(size=i) for i in - input_node.shape + input_shape ]) self._placeholders[node.name + ':0'] = \ - np.zeros(shape=input_node.shape, dtype=float) + np.zeros(shape=input_shape, dtype=float) @staticmethod def get_scope(tensor_name): @@ -357,10 +356,17 @@ class TensorflowConverter(base_converter.ConverterInterface): return tensor_name[:idx] def update_output_shapes(self, sess): - output_shapes = sess.run(self._output_shape_op_list, + tensors = [] + shape_tensors = [] + for tf_op in self._tf_graph.get_operations(): + for output in tf_op.outputs: + tensors.append(output.name) + shape_tensors.append(tf.shape(output)) + + tensor_shapes = sess.run(shape_tensors, feed_dict=self._placeholders) - for i in range(len(self._output_shape_list)): - self._output_shape_list[i].dims.extend(output_shapes[i]) + for i in range(len(tensors)): + self._output_shape[tensors[i]] = tensor_shapes[i] def convert_ops(self, sess): for tf_op in self._tf_graph.get_operations(): @@ -368,7 +374,7 @@ class TensorflowConverter(base_converter.ConverterInterface): "Mace does not support tensorflow op type %s yet" % tf_op.type) self._op_converters[tf_op.type](tf_op) - self.update_output_shapes(sess) + self.convert_tensors() def convert_tensors(self): @@ -402,18 +408,17 @@ class TensorflowConverter(base_converter.ConverterInterface): # this function tries to infer tensor shape, but some dimension shape # may be undefined due to variance of input length - def infer_tensor_shape(self, output_shape, tensor): - inferred_tensor_shape = tensor.shape.as_list() - inferred_success = True - for _, dim in enumerate(inferred_tensor_shape): - if dim is None: - inferred_success = False - break - if inferred_success: - output_shape.dims.extend(inferred_tensor_shape) + def 
+    def infer_tensor_shape(self, tensor, output_shape=None):
+        shape = None
+        if tensor.name in self._output_shape:
+            shape = self._output_shape[tensor.name]
         else:
-            self._output_shape_list.append(output_shape)
-            self._output_shape_op_list.append(tf.shape(tensor))
+            shape = tensor.shape.as_list()
+
+        if output_shape:
+            output_shape.dims.extend(shape)
+
+        return shape
 
     def convert_nop(self, tf_op):
         pass
@@ -426,7 +431,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
         op.output.extend([tf_output.name for tf_output in tf_op.outputs])
         for tf_output in tf_op.outputs:
             output_shape = op.output_shape.add()
-            self.infer_tensor_shape(output_shape, tf_output)
+            self.infer_tensor_shape(tf_output, output_shape)
 
         data_type_arg = op.arg.add()
         data_type_arg.name = 'T'
@@ -509,10 +514,10 @@ class TensorflowConverter(base_converter.ConverterInterface):
 
         def check_is_scalar(tf_op):
             if len(tf_op.inputs) == 1:
-                return len(tf_op.inputs[0].shape) == 0
+                return len(self.infer_tensor_shape(tf_op.inputs[0])) == 0
             elif len(tf_op.inputs) == 2:
-                return len(tf_op.inputs[0].shape) == 0 and \
-                    len(tf_op.inputs[1].shape) == 0
+                return len(self.infer_tensor_shape(tf_op.inputs[0])) == 0 and \
+                    len(self.infer_tensor_shape(tf_op.inputs[1])) == 0
 
         if check_is_scalar(tf_op):
             op.type = MaceOp.ScalarMath.name
@@ -539,9 +544,9 @@ class TensorflowConverter(base_converter.ConverterInterface):
                                  EltwiseType.SUM, EltwiseType.PROD,
                                  EltwiseType.MAX, EltwiseType.MIN]
 
-            if len(tf_op.inputs) > 1 and \
-                    len(tf_op.inputs[1].shape) == 0 and \
-                    tf_op.inputs[1].op.type == TFOpType.Const.name:
+            if (len(tf_op.inputs) > 1 and
+                    len(self.infer_tensor_shape(tf_op.inputs[1])) == 0 and
+                    tf_op.inputs[1].op.type == TFOpType.Const.name):
                 scalar = tf_op.inputs[1].eval().astype(np.float32)
                 value_arg = op.arg.add()
                 value_arg.name = MaceKeyword.mace_scalar_input_str
@@ -553,7 +558,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
                 value_index_arg.i = 1
                 self._skip_tensor.add(tf_op.inputs[1].name)
                 del op.input[1]
-            elif len(tf_op.inputs[0].shape) == 0 and \
+            elif len(self.infer_tensor_shape(tf_op.inputs[0])) == 0 and \
                     tf_op.inputs[0].op.type == TFOpType.Const.name and \
                     is_commutative(type_arg.i):
                 scalar = tf_op.inputs[0].eval().astype(np.float32)
@@ -1034,3 +1039,23 @@ class TensorflowConverter(base_converter.ConverterInterface):
 
         self._skip_tensor.add(tf_op.inputs[1].name)
         self._skip_tensor.add(tf_op.inputs[2].name)
+
+    def convert_cumsum(self, tf_op):
+        op = self.convert_general_op(tf_op)
+        op.type = MaceOp.Cumsum.name
+
+        axis = tf_op.inputs[1].eval().astype(np.int32)
+        axis_arg = op.arg.add()
+        axis_arg.name = MaceKeyword.mace_axis_str
+        axis_arg.i = axis
+        del op.input[1]
+
+        exclusive = tf_op.get_attr('exclusive')
+        exclusive_arg = op.arg.add()
+        exclusive_arg.name = MaceKeyword.mace_exclusive_str
+        exclusive_arg.i = int(exclusive)
+
+        reverse = tf_op.get_attr('reverse')
+        reverse_arg = op.arg.add()
+        reverse_arg.name = MaceKeyword.mace_reverse_str
+        reverse_arg.i = int(reverse)
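
For reference, the `exclusive` and `reverse` attributes that convert_cumsum forwards as MACE op arguments change the accumulation as follows; a minimal numpy sketch, for illustration only (not converter code):

    import numpy as np

    x = np.array([1., 2., 3., 4.])

    inclusive = np.cumsum(x)                            # [ 1.  3.  6. 10.]
    exclusive = np.concatenate(([0.], inclusive[:-1]))  # [ 0.  1.  3.  6.]  exclusive=True
    reverse = np.cumsum(x[::-1])[::-1]                  # [10.  9.  7.  4.]  reverse=True
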
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 33d4633635528b94a3d8d0ed108398368572a36c..6cae50dc2d9aa3b4e72b826371d38538b5061844 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -25,7 +25,6 @@ from mace.python.tools.converter_tool.base_converter import DataFormat
 from mace.python.tools.converter_tool.base_converter import DeviceType
 from mace.python.tools.converter_tool.base_converter import EltwiseType
 from mace.python.tools.converter_tool.base_converter import FrameworkType
-from mace.python.tools.converter_tool.base_converter import FilterFormat
 from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.converter_tool.base_converter import MaceOp
 from mace.python.tools.converter_tool.base_converter import PaddingMode
@@ -103,6 +102,8 @@ class Transformer(base_converter.ConverterInterface):
             self.transform_caffe_reshape_and_flatten,
             TransformerRule.TRANSFORM_CHANNEL_SHUFFLE:
             self.transform_channel_shuffle,
+            TransformerRule.QUANTIZE_SPECIFIC_OPS_ONLY:
+            self.quantize_specific_ops_only,
         }
 
         self._option = option
@@ -127,7 +128,7 @@ class Transformer(base_converter.ConverterInterface):
                 self.construct_ops_and_consumers(key)
                 changed = transformer()
                 if not changed:
-                   break
+                    break
 
         self.delete_after_check_nodes()
         return self._model, self._quantize_activation_info
@@ -147,12 +148,12 @@ class Transformer(base_converter.ConverterInterface):
         filter_format_value = ConverterUtil.get_arg(self._model,
                                                     MaceKeyword.mace_filter_format_str).i  # noqa
         filter_format = None
-        if filter_format_value == FilterFormat.HWIO.value:
-            filter_format = FilterFormat.HWIO
-        elif filter_format_value == FilterFormat.OIHW.value:
-            filter_format = FilterFormat.OIHW
-        elif filter_format_value == FilterFormat.HWOI.value:
-            filter_format = FilterFormat.HWOI
+        if filter_format_value == DataFormat.HWIO.value:
+            filter_format = DataFormat.HWIO
+        elif filter_format_value == DataFormat.OIHW.value:
+            filter_format = DataFormat.OIHW
+        elif filter_format_value == DataFormat.HWOI.value:
+            filter_format = DataFormat.HWOI
         else:
             mace_check(False, "filter format %d not supported" %
                        filter_format_value)
@@ -191,16 +192,23 @@ class Transformer(base_converter.ConverterInterface):
             op = mace_pb2.OperatorDef()
             op.name = self.normalize_op_name(input_node.name)
             op.type = "Input"
+            data_type_arg = op.arg.add()
+            data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_FLOAT
             op.output.extend([input_node.name])
             output_shape = op.output_shape.add()
             output_shape.dims.extend(input_node.shape)
-            if ConverterUtil.data_format(
-                    self._consumers[input_node.name][0]) \
-                    == DataFormat.NCHW:
-                self.transpose_shape(output_shape.dims, [0, 3, 1, 2])
-                ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
-            else:
-                ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
+            if input_node.name in self._consumers:
+                if ConverterUtil.data_format(
+                        self._consumers[input_node.name][0]) \
+                        == DataFormat.NCHW:
+                    self.transpose_shape(output_shape.dims,
+                                         [0, 3, 1, 2])
+                    ConverterUtil.add_data_format_arg(op,
+                                                      DataFormat.NCHW)
+                else:
+                    ConverterUtil.add_data_format_arg(op,
+                                                      DataFormat.NHWC)
             self._producer[op.output[0]] = op
 
     @staticmethod
@@ -221,10 +229,32 @@ class Transformer(base_converter.ConverterInterface):
         return name.replace(':', '_')
 
     def get_tensor_shape(self, tensor):
-        producer = self._producer[tensor]
-        for i in six.moves.range(len(producer.output)):
-            if producer.output[i] == tensor:
-                return list(producer.output_shape[i].dims)
+        if tensor in self._consts:
+            return list(self._consts[tensor].dims)
+        elif tensor in self._producer:
+            producer = self._producer[tensor]
+            for i in six.moves.range(len(producer.output)):
+                if producer.output[i] == tensor:
+                    return list(producer.output_shape[i].dims)
+        else:
+            return None
+
+    def get_tensor_data_type(self, tensor):
+        if tensor in self._consts:
+            return self._consts[tensor].data_type
+        elif tensor in self._producer:
+            producer = self._producer[tensor]
+            for i in six.moves.range(len(producer.output)):
+                if producer.output[i] == tensor:
+                    if i < len(producer.output_type):
+                        return producer.output_type[i]
+                    elif ConverterUtil.get_arg(producer, "T") is not None:
+                        return ConverterUtil.get_arg(producer, "T").i
+                    else:
+                        print("No data type filled: ", producer)
+                        return None
+        else:
+            return None
 
     def consumer_count(self, tensor_name):
         return len(self._consumers.get(tensor_name, []))
@@ -583,14 +613,14 @@ class Transformer(base_converter.ConverterInterface):
                     offset = self._consts[consumer_op.input[2]]
                     idx = 0
                     filter_format = self.filter_format()
-                    if filter_format == FilterFormat.HWIO:
+                    if filter_format == DataFormat.HWIO:
                         for hwi in six.moves.range(filter.dims[0]
                                                    * filter.dims[1]
                                                    * filter.dims[2]):
                             for o in six.moves.range(filter.dims[3]):
                                 filter.float_data[idx] *= scale.float_data[o]
                                 idx += 1
-                    elif filter_format == FilterFormat.OIHW:
+                    elif filter_format == DataFormat.OIHW:
                         for o in six.moves.range(filter.dims[0]):
                             for hwi in six.moves.range(filter.dims[1]
                                                        * filter.dims[2]
@@ -642,7 +672,7 @@ class Transformer(base_converter.ConverterInterface):
                     idx = 0
                     filter_format = self.filter_format()
                     # in deconv op O and I channel is switched
-                    if filter_format == FilterFormat.HWIO:
+                    if filter_format == DataFormat.HWIO:
                         for hw in six.moves.range(filter.dims[0]
                                                   * filter.dims[1]):
                             for o in six.moves.range(filter.dims[2]):
@@ -650,7 +680,7 @@ class Transformer(base_converter.ConverterInterface):
                                     filter.float_data[idx] *=\
                                         scale.float_data[o]
                                     idx += 1
-                    elif filter_format == FilterFormat.OIHW:
+                    elif filter_format == DataFormat.OIHW:
                         for i in six.moves.range(filter.dims[0]):
                             for o in six.moves.range(filter.dims[1]):
                                 for hw in six.moves.range(filter.dims[2]
@@ -705,7 +735,7 @@ class Transformer(base_converter.ConverterInterface):
                     idx = 0
                     filter_format = self.filter_format()
-                    if filter_format == FilterFormat.HWIO:
+                    if filter_format == DataFormat.HWIO:
                         for hw in six.moves.range(filter.dims[0]
                                                   * filter.dims[1]):
                             for i in six.moves.range(filter.dims[2]):
@@ -713,7 +743,7 @@ class Transformer(base_converter.ConverterInterface):
                                     filter.float_data[idx] *= scale.float_data[
                                         i * filter.dims[3] + o]
                                     idx += 1
-                    elif filter_format == FilterFormat.OIHW:
+                    elif filter_format == DataFormat.OIHW:
                         for o in six.moves.range(filter.dims[0]):
                             for i in six.moves.range(filter.dims[1]):
                                 for hw in six.moves.range(filter.dims[2]
@@ -760,17 +790,17 @@ class Transformer(base_converter.ConverterInterface):
     @staticmethod
     def sort_filter_shape(filter_shape, filter_format):
         """Return filter shape in HWIO order"""
-        if filter_format == FilterFormat.HWIO:
+        if filter_format == DataFormat.HWIO:
             filter_height = filter_shape[0]
             filter_width = filter_shape[1]
             in_channels = filter_shape[2]
             out_channels = filter_shape[3]
-        elif filter_format == FilterFormat.OIHW:
+        elif filter_format == DataFormat.OIHW:
             filter_height = filter_shape[2]
             filter_width = filter_shape[3]
             in_channels = filter_shape[1]
             out_channels = filter_shape[0]
-        elif filter_format == FilterFormat.HWOI:
+        elif filter_format == DataFormat.HWOI:
             filter_height = filter_shape[0]
             filter_width = filter_shape[1]
             in_channels = filter_shape[3]
@@ -933,7 +963,9 @@ class Transformer(base_converter.ConverterInterface):
         net = self._model
         for op in net.op:
-            if op.type == MaceOp.Conv2D.name:
+            if op.type == MaceOp.Conv2D.name \
+                    and len(op.input) >= 2 \
+                    and op.input[1] in self._consts:
                 producer = self._producer[op.input[0]]
                 input_shape = producer.output_shape[0].dims
                 batch, height, width, channels = self.sort_feature_map_shape(
@@ -975,12 +1007,13 @@ class Transformer(base_converter.ConverterInterface):
                 input_shape = list(input_op.output_shape[0].dims)
                 weight.dims[:] = [weight.dims[0]] + input_shape[1:]
                 if len(input_shape) == 2:
-                    if filter_format == FilterFormat.HWIO:
+                    if filter_format == DataFormat.HWIO:
                         weight.dims[:] = [1, 1] + weight.dims[:]
-                    elif filter_format == FilterFormat.OIHW:
+                    elif filter_format == DataFormat.OIHW:
                         weight.dims[:] = weight.dims[:] + [1, 1]
                     else:
-                        mace_check("FC does not support filter format %s",
+                        mace_check(False,
+                                   "FC does not support filter format %s" %
                                    filter_format.name)
         return False
@@ -1052,6 +1085,16 @@ class Transformer(base_converter.ConverterInterface):
                     new_axises.sort()
                     arg.ints[:] = []
                     arg.ints.extend(new_axises)
+                elif op.type == MaceOp.Crop.name:
+                    offset_arg = ConverterUtil.get_arg(op,
+                                                       MaceKeyword.mace_offset_str)
+                    mace_check(offset_arg and
+                               ConverterUtil.data_format(op) == DataFormat.NCHW and
+                               len(op.output_shape[0].dims) == 4,
+                               "MACE only supports crop with the NCHW format")
+                    print("Transpose crop args: %s(%s)"
+                          % (op.name, op.type))
+                    self.transpose_shape(offset_arg.ints, [0, 2, 3, 1])
 
             # transpose op output shape
             data_format = ConverterUtil.data_format(op)
@@ -1087,7 +1130,7 @@ class Transformer(base_converter.ConverterInterface):
             rhs = op.input[1]
             if rhs in self._consts and len(self._consts[rhs].dims) == 2:
                 arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_b_str)  # noqa
-                six.print_('transpose matmul weight')
+                six.print_("Transpose matmul weight %s" % rhs)
                 if arg is None:
                     arg = op.arg.add()
                     arg.name = MaceKeyword.mace_transpose_b_str
@@ -1110,12 +1153,12 @@ class Transformer(base_converter.ConverterInterface):
         if self._option.quantize and \
                 self._option.device == DeviceType.CPU.value:
             print("Transpose filters to OHWI")
-            if filter_format == FilterFormat.HWIO:
+            if filter_format == DataFormat.HWIO:
                 transpose_order = [3, 0, 1, 2]
-            elif filter_format == FilterFormat.OIHW:
+            elif filter_format == DataFormat.OIHW:
                 transpose_order = [0, 2, 3, 1]
             else:
-                mace_check("Quantize model does not support conv "
+                mace_check(False, "Quantize model does not support conv "
                            "filter format: %s" % filter_format.name)
 
         for op in net.op:
@@ -1141,20 +1184,22 @@ class Transformer(base_converter.ConverterInterface):
                     filter.dims[:] = filter_data.shape
                     transposed_deconv_filter.add(op.input[1])
 
-            self.set_filter_format(FilterFormat.OHWI)
+            self.set_filter_format(DataFormat.OHWI)
         elif self._option.quantize and \
-                self._option.device == DeviceType.HEXAGON.value:
+                (self._option.device == DeviceType.HEXAGON.value or
+                 self._option.device == DeviceType.HTA.value):
             print("Transpose filters to HWIO/HWIM")
-            mace_check(filter_format == FilterFormat.HWIO,
+            mace_check(filter_format == DataFormat.HWIO,
                        "HEXAGON only support HWIO/HWIM filter format.")
         else:
             print("Transpose filters to OIHW/MIHW")
             # transpose filter to OIHW/MIHW for tensorflow (HWIO/HWIM)
-            if filter_format == FilterFormat.HWIO:
+            if filter_format == DataFormat.HWIO:
                 for op in net.op:
                     if (op.type == MaceOp.Conv2D.name
                             or op.type == MaceOp.Deconv2D.name
                             or op.type == MaceOp.DepthwiseConv2d.name) \
+                            and op.input[1] in self._consts \
                             and op.input[1] not in transposed_filter:
                         filter = self._consts[op.input[1]]
                         filter_data = np.array(filter.float_data).reshape(
@@ -1184,7 +1229,7 @@ class Transformer(base_converter.ConverterInterface):
                         weight.dims[:] = weight_data.shape
                         transposed_filter.add(op.input[1])
 
-            self.set_filter_format(FilterFormat.OIHW)
+            self.set_filter_format(DataFormat.OIHW)
 
         # deconv's filter's output channel and input channel is reversed
         for op in net.op:
             if op.type in [MaceOp.Deconv2D.name,
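
The transpose orders chosen above ([3, 0, 1, 2] for HWIO and [0, 2, 3, 1] for OIHW, both targeting OHWI) can be sanity-checked with a quick numpy sketch; the filter sizes here are made up for illustration:

    import numpy as np

    hwio = np.zeros((3, 3, 16, 32))  # H, W, I, O
    oihw = np.zeros((32, 16, 3, 3))  # O, I, H, W

    assert hwio.transpose(3, 0, 1, 2).shape == (32, 3, 3, 16)  # O, H, W, I
    assert oihw.transpose(0, 2, 3, 1).shape == (32, 3, 3, 16)  # O, H, W, I
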
@@ -1265,7 +1310,7 @@ class Transformer(base_converter.ConverterInterface):
                     len(op.input) == 2 and \
                     op.input[1] in self._consts and \
                     len(op.output_shape[0].dims) == 2 and \
-                    filter_format == FilterFormat.HWIO and \
+                    filter_format == DataFormat.HWIO and \
                     op.input[0] in self._producer:
                 input_op = self._producer[op.input[0]]
                 input_shape = input_op.output_shape[0].dims
@@ -1298,7 +1343,8 @@ class Transformer(base_converter.ConverterInterface):
 
             # transform `fc1(2D) -> matmul` to `fc1(2D) -> fc1(2D)`
             if op.type == MaceOp.MatMul.name and \
-                    filter_format == FilterFormat.HWIO:
+                    filter_format == DataFormat.HWIO and \
+                    op.input[1] in self._consts:
                 producer = self._producer[op.input[0]]
                 weight = self._consts[op.input[1]]
                 if len(weight.dims) == 2 and self.is_after_fc(op) and \
@@ -1373,21 +1419,18 @@ class Transformer(base_converter.ConverterInterface):
         return False
 
     def update_data_format(self):
-        data_format_flag = DataFormat.NHWC.value
+        print("update data format")
+        data_format_flag = 1
         for input_node in self._option.input_nodes.values():
             if input_node.data_format.value == DataFormat.DF_NONE.value:
-                data_format_flag = DataFormat.DF_NONE.value
-
+                data_format_flag = 0
         net = self._model
         for op in net.op:
-            data_format_arg = ConverterUtil.get_arg(
+            ConverterUtil.del_arg(
                 op, MaceKeyword.mace_data_format_str)
-            if not data_format_arg:
-                data_format_arg = op.arg.add()
-                data_format_arg.name = MaceKeyword.mace_data_format_str
-                data_format_arg.i = data_format_flag
-            elif data_format_arg.i != data_format_flag:
-                data_format_arg.i = data_format_flag
+            has_data_format_arg = op.arg.add()
+            has_data_format_arg.name = MaceKeyword.mace_has_data_format_str
+            has_data_format_arg.i = data_format_flag
         return False
 
     def quantize_nodes(self):
@@ -1423,10 +1466,11 @@ class Transformer(base_converter.ConverterInterface):
                 else:
                     mace_check(op.type == MaceOp.Quantize.name,
                                "Quantization only support float ops, "
-                               "but get %s(%s)"
-                               % (op.name, op.type))
+                               "but get %s(%s, %s)"
+                               % (op.name, op.type,
+                                  mace_pb2.DataType.Name(data_type_arg.i)))
 
-        for input_node in self._option.input_nodes.values():
+        for i, input_node in enumerate(self._option.input_nodes.values()):
             new_input_name = self.input_name_map[input_node.name]
             op_def = self._model.op.add()
             op_def.name = self.normalize_op_name(new_input_name)
@@ -1435,8 +1479,10 @@ class Transformer(base_converter.ConverterInterface):
             op_def.output.extend([new_input_name])
             output_shape = op_def.output_shape.add()
             output_shape.dims.extend(input_node.shape)
-            self.copy_quantize_info(
-                op_def, self._quantize_activation_info[new_input_name])
+            quantize_info = self._quantize_activation_info[new_input_name]
+            self.copy_quantize_info(op_def, quantize_info)
+            self._model.input_info[i].scale = quantize_info.scale
+            self._model.input_info[i].zero_point = quantize_info.zero_point
 
             ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
             ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC)
@@ -1447,16 +1493,19 @@ class Transformer(base_converter.ConverterInterface):
             find_range_every_time_arg.i = 1
 
         output_nodes = self._option.check_nodes.values()
-        for output_node in output_nodes:
+        for i, output_node in enumerate(output_nodes):
             op_def = self._model.op.add()
             op_def.name = self.normalize_op_name(output_node.name)
             op_def.type = MaceOp.Dequantize.name
             op_def.input.extend([self.output_name_map[output_node.name]])
             op_def.output.extend([output_node.name])
             output_shape = op_def.output_shape.add()
-            output_shape.dims.extend(
-                self._producer[output_node.name].output_shape[0].dims)
+            producer_op = self._producer[output_node.name]
+            output_shape.dims.extend(producer_op.output_shape[0].dims)
             op_def.output_type.extend([mace_pb2.DT_FLOAT])
+            quantize_info = producer_op.quantize_info[0]
+            self._model.output_info[i].scale = quantize_info.scale
+            self._model.output_info[i].zero_point = quantize_info.zero_point
 
             ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
@@ -1503,7 +1552,8 @@ class Transformer(base_converter.ConverterInterface):
                     quantized_tensor = \
                         quantize_util.quantize_with_scale_and_zero(
                             tensor.float_data, scale, 0)
-                elif self._option.device == DeviceType.HEXAGON.value:
+                elif self._option.device == DeviceType.HEXAGON.value or \
+                        self._option.device == DeviceType.HTA.value:
                     quantized_tensor = \
                         quantize_util.quantize_bias_for_hexagon(
                             tensor.float_data)
@@ -1661,7 +1711,7 @@ class Transformer(base_converter.ConverterInterface):
             return False
 
         print("Add default quantize info for input")
-        for input_node in self._option.input_nodes.values():
+        for i, input_node in enumerate(self._option.input_nodes.values()):
             if input_node.name not in self._quantize_activation_info:
                 print("Input range %s: %s" % (input_node.name,
                                               str(input_node.range)))
@@ -1670,7 +1720,8 @@ class Transformer(base_converter.ConverterInterface):
                     quantize_util.adjust_range(input_node.range[0],
                                                input_node.range[1],
                                                non_zero=False)
-                quantize_info = mace_pb2.QuantizeActivationInfo()
+                quantize_info = \
+                    mace_pb2.QuantizeActivationInfo()
                 quantize_info.minval = minval
                 quantize_info.maxval = maxval
                 quantize_info.scale = scale
@@ -1725,18 +1776,29 @@ class Transformer(base_converter.ConverterInterface):
                 self.add_quantize_info(op, 0.0, 1.0)
                 self._quantize_activation_info[op.output[0]] = quantize_info
             elif (op.type == MaceOp.Eltwise.name
-                  and ConverterUtil.get_arg(op, MaceKeyword.mace_element_type_str).i == EltwiseType.SUM.value  # noqa
                   and not op.quantize_info
                   and len(op.input) == 2
                   and len(op.input[0]) not in self._consts
                   and len(op.input[1]) not in self._consts):
-                del op.quantize_info[:]
                 producer_op0 = self._producer[op.input[0]]
                 producer_op1 = self._producer[op.input[1]]
-                minval = producer_op0.quantize_info[0].minval \
-                    + producer_op1.quantize_info[0].minval
-                maxval = producer_op0.quantize_info[0].maxval \
-                    + producer_op1.quantize_info[0].maxval
+                if ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                        == EltwiseType.SUM.value:
+                    minval = producer_op0.quantize_info[0].minval \
+                        + producer_op1.quantize_info[0].minval
+                    maxval = producer_op0.quantize_info[0].maxval \
+                        + producer_op1.quantize_info[0].maxval
+                elif ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                        == EltwiseType.SUB.value:
+                    minval = producer_op0.quantize_info[0].minval \
+                        - producer_op1.quantize_info[0].maxval
+                    maxval = producer_op0.quantize_info[0].maxval \
+                        - producer_op1.quantize_info[0].minval
+                else:
+                    mace_check(False, "Quantized Elementwise only supports "
+                               "SUM and SUB now.")
                 quantize_info = \
                     self.add_quantize_info(op, minval, maxval)
                 self._quantize_activation_info[op.output[0]] = quantize_info
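
The min/max propagation above is plain interval arithmetic: for a SUM the bounds add, while for a SUB the lower bound subtracts the other operand's upper bound and vice versa. A worked example with made-up ranges:

    # x in [-1, 2], y in [0, 3]
    min_x, max_x = -1.0, 2.0
    min_y, max_y = 0.0, 3.0

    assert (min_x + min_y, max_x + max_y) == (-1.0, 5.0)  # x + y range
    assert (min_x - max_y, max_x - min_y) == (-4.0, 2.0)  # x - y range
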
@@ -1880,3 +1942,131 @@ class Transformer(base_converter.ConverterInterface):
                 producer_op.output_shape[0].dims[:] = output_shape
 
         return True
+
+    def quantize_specific_ops_only(self):
+        """
+        This transform rule is only used internally; it is not exposed to
+        users, to keep the public options simple.
+        """
+        to_quantize_ops_output_type = {
+            MaceOp.MatMul.name: mace_pb2.DT_INT32,
+            MaceOp.Gather.name: mace_pb2.DT_UINT8,
+        }
+
+        for op in self._model.op:
+            if (op.type not in to_quantize_ops_output_type
+                    or len(op.output) > 1
+                    or ConverterUtil.get_arg(op,
+                                             MaceKeyword.mace_op_data_type_str).i != mace_pb2.DT_FLOAT):  # noqa
+                # only support single output
+                continue
+
+            quantized_inputs_names = []
+
+            should_quantize = False
+            has_const = False
+            for idx, input_tensor in enumerate(op.input):
+                if input_tensor in self._consts:
+                    has_const = True
+                    break
+            if not has_const:
+                continue
+
+            for idx, input_tensor in enumerate(op.input):
+                if self.get_tensor_data_type(input_tensor) \
+                        == mace_pb2.DT_FLOAT:
+                    should_quantize = True
+                    break
+            if not should_quantize:
+                continue
+            else:
+                print("Quantize op %s (%s)" % (op.name, op.type))
+
+            non_zero = self._option.device == DeviceType.CPU.value \
+                and op.type == MaceOp.MatMul.name
+
+            for idx, input_tensor in enumerate(op.input):
+                quantized_inputs_names.append(input_tensor)
+
+                if self.get_tensor_data_type(input_tensor) \
+                        != mace_pb2.DT_FLOAT:
+                    continue
+
+                if input_tensor in self._consts:
+                    const_tensor = self._consts[input_tensor]
+                    quantized_tensor = quantize_util.quantize(
+                        const_tensor.float_data, non_zero)
+                    del const_tensor.float_data[:]
+                    const_tensor.int32_data.extend(quantized_tensor.data)
+                    const_tensor.data_type = mace_pb2.DT_UINT8
+                    const_tensor.scale = quantized_tensor.scale
+                    const_tensor.zero_point = quantized_tensor.zero
+                    const_tensor.minval = quantized_tensor.minval
+                    const_tensor.maxval = quantized_tensor.maxval
+                    const_tensor.quantized = True
+                else:
+                    input_shape = self.get_tensor_shape(input_tensor)
+                    quantize_op = self._model.op.add()
+                    quantize_op.name = self.normalize_op_name(
+                        input_tensor) + "_quant"
+                    quantize_op.type = MaceOp.Quantize.name
+                    quantize_op.input.extend([input_tensor])
+                    quantize_output_name = quantize_op.name + '_0'
+                    quantize_op.output.extend([quantize_output_name])
+                    output_shape = quantize_op.output_shape.add()
+                    output_shape.dims.extend(input_shape)
+                    quantize_op.output_type.extend([mace_pb2.DT_UINT8])
+                    data_type_arg = quantize_op.arg.add()
+                    data_type_arg.name = MaceKeyword.mace_op_data_type_str
+                    data_type_arg.i = mace_pb2.DT_UINT8
+
+                    data_type_arg = quantize_op.arg.add()
+                    data_type_arg.name = MaceKeyword.mace_non_zero
+                    if non_zero:
+                        data_type_arg.i = 1
+                    else:
+                        data_type_arg.i = 0
+
+                    find_range_arg = quantize_op.arg.add()
+                    find_range_arg.name = \
+                        MaceKeyword.mace_find_range_every_time
+                    find_range_arg.i = 1
+
+                    quantized_inputs_names[-1] = quantize_output_name
+
+                non_zero = False
+
+            del op.input[:]
+            op.input.extend(quantized_inputs_names)
+
+            original_output_name = op.output[0]
+            op.output[0] = original_output_name + "_quant"
+            op.output_type.extend([to_quantize_ops_output_type[op.type]])
+            data_type_arg = ConverterUtil.get_arg(op,
+                                                  MaceKeyword.mace_op_data_type_str)  # noqa
+            if data_type_arg is None:
+                data_type_arg = op.arg.add()
+                data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_UINT8
+
+            dequantize_op = self._model.op.add()
+            dequantize_op.name = op.name + "_dequant"
+            dequantize_op.type = MaceOp.Dequantize.name
+            dequantize_op.input.extend([op.output[0]])
+            dequantize_op.output.extend([original_output_name])
+            dequantize_op.output_shape.extend(op.output_shape)
+            dequantize_op.output_type.extend([mace_pb2.DT_FLOAT])
+            data_type_arg = dequantize_op.arg.add()
+            data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = to_quantize_ops_output_type[op.type]
+
+            quantize_flag_arg = ConverterUtil.get_arg(self._model,
+                                                      MaceKeyword.mace_quantize_flag_arg_str)  # noqa
+            if quantize_flag_arg is None:
+                quantize_flag_arg = self._model.arg.add()
+                quantize_flag_arg.name = MaceKeyword.mace_quantize_flag_arg_str
+                quantize_flag_arg.i = 1
+
+            return True
+
+        return False
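
quantize_specific_ops_only wraps a float MatMul or Gather into a Quantize -> op -> Dequantize sandwich, with MatMul accumulating into DT_INT32. A rough numpy sketch of the underlying idea (simplified per-tensor affine quantization; names and formulas here are illustrative, not the MACE implementation):

    import numpy as np

    def quantize(x):
        scale = (x.max() - x.min()) / 255.0
        zero = int(round(-x.min() / scale))
        q = np.clip(np.round(x / scale).astype(np.int32) + zero, 0, 255)
        return q.astype(np.uint8), scale, zero

    a = np.random.rand(4, 8).astype(np.float32)
    b = np.random.rand(8, 4).astype(np.float32)
    qa, sa, za = quantize(a)
    qb, sb, zb = quantize(b)

    # int32 accumulation, like the DT_INT32 output type chosen for MatMul
    acc = (qa.astype(np.int32) - za) @ (qb.astype(np.int32) - zb)
    dequant = acc * sa * sb  # the trailing Dequantize op

    print(np.abs(dequant - a @ b).max())  # small quantization error
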
diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2
index 1beae21ab41ca1fc583fbd016b4e6d8430ad2ff9..89bee8d8f9dba8ce27ff97ff016381eb7b9da5e7 100644
--- a/mace/python/tools/model.jinja2
+++ b/mace/python/tools/model.jinja2
@@ -16,10 +16,10 @@
 
 #include
 
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/proto/mace.pb.h"
 #include "mace/public/mace.h"
-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
@@ -75,7 +75,7 @@ void CreateNetArg(NetDef *net_def) {
 {% if net.input_info | length > 0 %}
 void CreateInputInfo(NetDef *net_def) {
   net_def->mutable_input_info()->Reserve({{ net.input_info | length }});
-  InputInfo *input_info = nullptr;
+  InputOutputInfo *input_info = nullptr;
   {% for idx in range(net.input_info|length) %}
   input_info = net_def->add_input_info();
   input_info->set_name({{ net.input_info[idx].name|tojson }});
@@ -92,7 +92,7 @@ void CreateInputInfo(NetDef *net_def) {
 {% if net.output_info | length > 0 %}
 void CreateOutputInfo(NetDef *net_def) {
   net_def->mutable_output_info()->Reserve({{ net.output_info | length }});
-  OutputInfo *output_info = nullptr;
+  InputOutputInfo *output_info = nullptr;
   {% for idx in range(net.output_info|length) %}
   output_info = net_def->add_output_info();
   output_info->set_name({{ net.output_info[idx].name|tojson }});
diff --git a/mace/python/tools/operator.jinja2 b/mace/python/tools/operator.jinja2
index 8992da31ef7c9468b723d362ac04ab98511593f5..b184b54a3d98f034147866d04a6b48c1af0703f9 100644
--- a/mace/python/tools/operator.jinja2
+++ b/mace/python/tools/operator.jinja2
@@ -19,7 +19,7 @@
 
 #include "mace/proto/mace.pb.h"
 #include "mace/public/mace.h"
-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
diff --git a/mace/python/tools/quantization/quantize_util.py b/mace/python/tools/quantization/quantize_util.py
index 349393870e24e39073761bd10faffc4277a7335d..666b94bdf58e6311e50d5351df8b233a60f50922 100644
--- a/mace/python/tools/quantization/quantize_util.py
+++ b/mace/python/tools/quantization/quantize_util.py
@@ -100,7 +100,7 @@ def cal_multiplier_and_shift(scale):
 
 
 def quantize_with_scale_and_zero(data, scale, zero):
-    output = np.round(zero + data / scale).astype(int)
+    output = np.round(zero + data / scale).astype(np.int32)
     quantized_data = QuantizedData()
     quantized_data.data = output
     quantized_data.scale = scale
@@ -114,7 +114,7 @@ def quantize(data, non_zero):
     in_max = np_data.max()
     scale, zero, out_min, out_max = adjust_range(in_min, in_max,
                                                  non_zero=non_zero)
-    output = np.clip((np.round(zero + data / scale).astype(int)), 0, 255)
+    output = np.clip((np.round(zero + data / scale).astype(np.int32)), 0, 255)
 
     quantized_data = QuantizedData()
     quantized_data.data = output
@@ -132,7 +132,7 @@ def quantize_bias_for_hexagon(data):
     in_max = max_val
     scale = (in_max - in_min) / 2**32
     zero = 0
-    output = np.clip((np.round(zero + data / scale).astype(long)),
+    output = np.clip((np.round(zero + data / scale).astype(np.int64)),
                      -2**31, 2**31 - 1)
 
     quantized_data = QuantizedData()
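
The affine mapping used throughout quantize_util is q = round(zero + x / scale), with the inverse x ~= (q - zero) * scale; the casts above now use explicit fixed-width numpy types instead of the platform-dependent int/long. A small round-trip with assumed scale and zero point:

    import numpy as np

    data = np.array([-0.5, 0.0, 0.7], dtype=np.float32)
    scale, zero = 0.01, 50

    q = np.round(zero + data / scale).astype(np.int32)  # [  0  50 120]
    recovered = (q - zero) * scale

    print(q, np.abs(recovered - data).max())  # error bounded by scale / 2
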
diff --git a/mace/python/tools/tensor_source.jinja2 b/mace/python/tools/tensor_source.jinja2
index 77d91eab6aff431549b8e848369503944d52d5d3..d459d9bc806d23f7cb49ad90ba72f2a753dfd886 100644
--- a/mace/python/tools/tensor_source.jinja2
+++ b/mace/python/tools/tensor_source.jinja2
@@ -16,7 +16,7 @@
 
 #include "mace/proto/mace.pb.h"
 #include "mace/public/mace.h"
-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
diff --git a/mace/python/tools/visualization/BUILD b/mace/python/tools/visualization/BUILD.bazel
similarity index 100%
rename from mace/python/tools/visualization/BUILD
rename to mace/python/tools/visualization/BUILD.bazel
diff --git a/mace/python/tools/visualization/index.html b/mace/python/tools/visualization/index.html
index f36ea26066f44b6d1d999386d8e9035618b55fa6..658897fa646c159b06ece05aeca16b90e4c82fdf 100644
--- a/mace/python/tools/visualization/index.html
+++ b/mace/python/tools/visualization/index.html
@@ -111,7 +111,11 @@ Click node to see details at bottom of this page.
         var output_shapes = [];
         if (typeof node["outputShape"] !== "undefined") {
           for (var j = 0; j < node["outputShape"].length; j++) {
-            var output_shape = node["outputShape"][j].dims.join(",");
+            var output_shape = "";
+            if (typeof node["outputShape"][j].dims !== "undefined") {
+              console.log(node["outputShape"][j].dims);
+              output_shape = node["outputShape"][j].dims.join(",");
+            }
             output_shapes.push(output_shape);
           }
         }
@@ -140,11 +144,15 @@ Click node to see details at bottom of this page.
             " min=" + node["minval"] +
             " max=" + node["maxval"];
         }
+        var dims = "";
+        if (typeof node["dims"] !== "undefined") {
+          dims = node["dims"].join(",");
+        }
         tensor_data.push({
           "idx": tensor_data.length,
           "name": node["name"],
           "data_type": node["dataType"],
-          "dims": node["dims"].join(","),
+          "dims": dims,
           "quantize_info": quantize_info
         })
diff --git a/mace/test/BUILD b/mace/test/BUILD.bazel
similarity index 93%
rename from mace/test/BUILD
rename to mace/test/BUILD.bazel
index 36a2b6472d46db4360b1840b6031f32f94212e40..a5c5f974552dd13b35faff26f7e14266e042b3fc 100644
--- a/mace/test/BUILD
+++ b/mace/test/BUILD.bazel
@@ -11,6 +11,7 @@ load(
     "if_openmp_enabled",
     "if_android_armv7",
     "if_hexagon_enabled",
+    "if_hta_enabled",
     "if_opencl_enabled",
     "if_quantize_enabled",
 )
@@ -45,6 +46,8 @@ cc_test(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
@@ -78,6 +81,8 @@ cc_test(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
    ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
@@ -111,6 +116,8 @@ cc_test(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
@@ -143,6 +150,8 @@ cc_test(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
diff --git a/mace/test/mace_api_exception_test.cc b/mace/test/mace_api_exception_test.cc
index 075b04b40c7467d2d6a6dff10b6cb245521b68f5..232023dace17584f49c15a499b196c538f6598eb 100644
--- a/mace/test/mace_api_exception_test.cc
+++ b/mace/test/mace_api_exception_test.cc
@@ -29,7 +29,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
   std::shared_ptr<NetDef> net_def(new NetDef());
 
   for (size_t i = 0; i < input_names.size(); ++i) {
-    InputInfo *info = net_def->add_input_info();
+    InputOutputInfo *info = net_def->add_input_info();
     info->set_name(input_names[i]);
   }
diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc
index 6124792c5f1e395777b3874860e570173cad51c8..ee14129a05dd23d7d2fa6b3bcc491da375c12096 100644
--- a/mace/test/mace_api_mt_test.cc
+++ b/mace/test/mace_api_mt_test.cc
@@ -45,14 +45,15 @@ void MaceRunFunc(const int in_out_size) {
             filter_tensor_name, filter_shape, 0, data.size(), net_def.get());
 
   for (size_t i = 0; i < input_names.size(); ++i) {
-    InputInfo *info = net_def->add_input_info();
+    InputOutputInfo *info = net_def->add_input_info();
+    info->set_data_format(DataFormat::NHWC);
     info->set_name(input_names[i]);
     for (auto d : input_shapes[0]) {
       info->add_dims(static_cast<int64_t>(d));
     }
   }
   for (size_t i = 0; i < output_names.size(); ++i) {
-    OutputInfo *info = net_def->add_output_info();
+    InputOutputInfo *info = net_def->add_output_info();
     info->set_name(output_names[i]);
   }
   for (size_t i = 0; i < output_names.size(); ++i) {
diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc
index 438683fec2f694b73ac0d5b132bb73f1bf6377db..0a852a17a9a9cfd6a7d331556b1ad1b1a85e397a 100644
--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
@@ -44,14 +44,15 @@ void MaceRun(const int in_out_size,
   AddTensor(filter_tensor_name, filter_shape, 0, data.size(), net_def.get());
 
   for (size_t i = 0; i < input_names.size(); ++i) {
-    InputInfo *info = net_def->add_input_info();
+    InputOutputInfo *info = net_def->add_input_info();
+    info->set_data_format(DataFormat::NHWC);
     info->set_name(input_names[i]);
     for (auto d : max_shape) {
       info->add_dims(static_cast<int64_t>(d));
     }
   }
   for (size_t i = 0; i < output_names.size(); ++i) {
-    OutputInfo *info = net_def->add_output_info();
+    InputOutputInfo *info = net_def->add_output_info();
     info->set_name(output_names[i]);
   }
   for (size_t i = 0; i < output_names.size(); ++i) {
@@ -123,12 +124,11 @@ TEST_F(MaceAPITest, MultipleInputOutput) {
 }
 
 TEST_F(MaceAPITest, VariableInputShape) {
-  // TODO(liyin): there is a bug of cpu convolution
-//  MaceRun<float>(1,
-//                 {1, 32, 64, 16},
-//                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
-//                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
-//                 {16, 16, 3, 3});
+  MaceRun<float>(1,
+                 {1, 32, 64, 16},
+                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
+                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
+                 {16, 16, 3, 3});
   MaceRun<float>(1,
                  {1, 32, 64, 16},
                  {{1, 16, 32, 16}, {1, 32, 64, 16}},
diff --git a/mace/test/mace_api_test.h b/mace/test/mace_api_test.h
index 2c2ed7d177fb2b1d834f427a5ecfaa956fe7e648..2257b2162ca6d53e81fd29367594bf860ff115ec 100644
--- a/mace/test/mace_api_test.h
+++ b/mace/test/mace_api_test.h
@@ -76,6 +76,7 @@ void Conv3x3(const std::string &input_name,
       .AddIntArg("padding", Padding::SAME)
       .AddIntsArg("dilations", {1, 1})
       .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddIntArg("has_data_format", 1)
       .Finalize(&operator_def);
 
   OutputShape *shape = operator_def.add_output_shape();
@@ -98,6 +99,7 @@ void Relu(const std::string &input_name,
       .AddStringArg("activation", "RELU")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
       .AddIntArg("device", static_cast<int>(device_type))
+      .AddIntArg("has_data_format", 1)
       .Finalize(&operator_def);
 
   net_def->add_op()->CopyFrom(operator_def);
diff --git a/mace/tools/validation/BUILD b/mace/tools/validation/BUILD.bazel
similarity index 86%
rename from mace/tools/validation/BUILD
rename to mace/tools/validation/BUILD.bazel
index 7e238c00730ba3f3ad87259aa857be61f8e72653..d85283acbc9b1e407e3c7a0bf69ebf5182804897 100644
--- a/mace/tools/validation/BUILD
+++ b/mace/tools/validation/BUILD.bazel
@@ -29,14 +29,8 @@ cc_binary(
     ] + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
     ]),
-    linkopts = [
-        "-lm",
-    ] + if_openmp_enabled([
-        "-fopenmp"
-    ]) + if_android([
-        "-ldl",
-        "-pie",
-        "-llog",
+    linkopts = if_openmp_enabled([
+        "-fopenmp",
     ]),
     linkstatic = 0,
     deps = [
"-ldl", - "-pie", - "-llog", + linkopts = if_openmp_enabled([ + "-fopenmp", ]), linkstatic = 0, deps = [ diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 7ea06f089eca63a189edac2306641bac81e39c7f..0653304fde80b275217eba9332ab4a121c169a9a 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -34,9 +34,10 @@ #include "gflags/gflags.h" #include "mace/public/mace.h" -#include "mace/utils/env_time.h" +#include "mace/port/env.h" +#include "mace/port/file_system.h" #include "mace/utils/logging.h" -#include "mace/utils/utils.h" +#include "mace/utils/string_util.h" #ifdef MODEL_GRAPH_FORMAT_CODE #include "mace/codegen/engine/mace_engine_factory.h" @@ -46,29 +47,6 @@ namespace mace { namespace tools { namespace validation { -namespace str_util { - -std::vector Split(const std::string &str, char delims) { - std::vector result; - if (str.empty()) { - result.push_back(""); - return result; - } - std::string tmp = str; - while (!tmp.empty()) { - size_t next_offset = tmp.find(delims); - result.push_back(tmp.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp = tmp.substr(next_offset + 1); - } - } - return result; -} - -} // namespace str_util - void ParseShape(const std::string &str, std::vector *shape) { std::string tmp = str; while (!tmp.empty()) { @@ -98,11 +76,25 @@ DeviceType ParseDeviceType(const std::string &device_str) { return DeviceType::GPU; } else if (device_str.compare("HEXAGON") == 0) { return DeviceType::HEXAGON; + } else if (device_str.compare("HTA") == 0) { + return DeviceType::HTA; } else { return DeviceType::CPU; } } +DataFormat ParseDataFormat(const std::string &data_format_str) { + if (data_format_str == "NHWC") { + return DataFormat::NHWC; + } else if (data_format_str == "NCHW") { + return DataFormat::NCHW; + } else if (data_format_str == "OIHW") { + return DataFormat::OIHW; + } else { + return DataFormat::DF_NONE; + } +} + struct mallinfo LogMallinfoChange(struct mallinfo prev) { struct mallinfo curr = mallinfo(); if (prev.arena != curr.arena) { @@ -168,6 +160,12 @@ DEFINE_string(output_node, DEFINE_string(output_shape, "1,224,224,2:1,1,1,10", "output shapes, separated by colon and comma"); +DEFINE_string(input_data_format, + "NHWC", + "input data formats, NONE|NHWC|NCHW"); +DEFINE_string(output_data_format, + "NHWC", + "output data formats, NONE|NHWC|NCHW"); DEFINE_string(input_file, "", "input file name | input file prefix for multiple inputs."); @@ -206,8 +204,11 @@ DEFINE_int32(cpu_affinity_policy, 1, bool RunModel(const std::string &model_name, const std::vector &input_names, const std::vector> &input_shapes, + const std::vector &input_data_formats, const std::vector &output_names, - const std::vector> &output_shapes) { + const std::vector> &output_shapes, + const std::vector &output_data_formats, + float cpu_capability) { DeviceType device_type = ParseDeviceType(FLAGS_device); int64_t t0 = NowMicros(); @@ -243,20 +244,24 @@ bool RunModel(const std::string &model_name, } #endif // MACE_ENABLE_OPENCL - std::vector model_graph_data; + std::unique_ptr model_graph_data; if (FLAGS_model_file != "") { - if (!mace::ReadBinaryFile(&model_graph_data, FLAGS_model_file)) { + auto fs = GetFileSystem(); + status = fs->NewReadOnlyMemoryRegionFromFile(FLAGS_model_file.c_str(), + &model_graph_data); + if (status != MaceStatus::MACE_SUCCESS) { LOG(FATAL) << "Failed to read file: " << FLAGS_model_file; } } - const unsigned char *model_weights_data = nullptr; - size_t 
@@ -268,8 +273,9 @@ bool RunModel(const std::string &model_name,
 #ifdef MODEL_GRAPH_FORMAT_CODE
   create_engine_status =
       CreateMaceEngineFromCode(model_name,
-                               model_weights_data,
-                               model_weights_data_size,
+                               reinterpret_cast<const unsigned char *>(
+                                   model_weights_data->data()),
+                               model_weights_data->length(),
                                input_names,
                                output_names,
                                config,
@@ -277,10 +283,12 @@ bool RunModel(const std::string &model_name,
 #else
   (void)(model_name);
   create_engine_status =
-      CreateMaceEngineFromProto(model_graph_data.data(),
-                                model_graph_data.size(),
-                                model_weights_data,
-                                model_weights_data_size,
+      CreateMaceEngineFromProto(reinterpret_cast<const unsigned char *>(
+                                    model_graph_data->data()),
+                                model_graph_data->length(),
+                                reinterpret_cast<const unsigned char *>(
+                                    model_weights_data->data()),
+                                model_weights_data->length(),
                                 input_names,
                                 output_names,
                                 config,
@@ -325,7 +333,8 @@ bool RunModel(const std::string &model_name,
       LOG(INFO) << "Open input file failed";
       return -1;
     }
-    inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in);
+    inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in,
+                                              input_data_formats[i]);
   }
 
   for (size_t i = 0; i < output_count; ++i) {
@@ -334,7 +343,8 @@ bool RunModel(const std::string &model_name,
                                       std::multiplies<int64_t>());
     auto buffer_out = std::shared_ptr<float>(new float[output_size],
                                              std::default_delete<float[]>());
-    outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
+    outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
+                                                output_data_formats[i]);
   }
 
   LOG(INFO) << "Warm up run";
@@ -349,18 +359,21 @@ bool RunModel(const std::string &model_name,
 #ifdef MODEL_GRAPH_FORMAT_CODE
       create_engine_status =
           CreateMaceEngineFromCode(model_name,
-                                   model_weights_data,
-                                   model_weights_data_size,
+                                   reinterpret_cast<const unsigned char *>(
+                                       model_weights_data->data()),
+                                   model_weights_data->length(),
                                    input_names,
                                    output_names,
                                    config,
                                    &engine);
 #else
       create_engine_status =
-          CreateMaceEngineFromProto(model_graph_data.data(),
-                                    model_graph_data.size(),
-                                    model_weights_data,
-                                    model_weights_data_size,
+          CreateMaceEngineFromProto(reinterpret_cast<const unsigned char *>(
+                                        model_graph_data->data()),
+                                    model_graph_data->length(),
+                                    reinterpret_cast<const unsigned char *>(
+                                        model_weights_data->data()),
+                                    model_weights_data->length(),
                                     input_names,
                                     output_names,
                                     config,
@@ -392,22 +405,26 @@ bool RunModel(const std::string &model_name,
 #ifdef MODEL_GRAPH_FORMAT_CODE
       create_engine_status =
           CreateMaceEngineFromCode(model_name,
-                                   model_weights_data,
-                                   model_weights_data_size,
+                                   reinterpret_cast<const unsigned char *>(
+                                       model_weights_data->data()),
+                                   model_weights_data->length(),
                                    input_names,
                                    output_names,
                                    config,
                                    &engine);
 #else
       create_engine_status =
-          CreateMaceEngineFromProto(model_graph_data.data(),
-                                    model_graph_data.size(),
-                                    model_weights_data,
-                                    model_weights_data_size,
-                                    input_names,
-                                    output_names,
-                                    config,
-                                    &engine);
+          CreateMaceEngineFromProto(
+              reinterpret_cast<const unsigned char *>(
+                  model_graph_data->data()),
+              model_graph_data->length(),
+              reinterpret_cast<const unsigned char *>(
+                  model_weights_data->data()),
+              model_weights_data->length(),
+              input_names,
+              output_names,
+              config,
+              &engine);
 #endif
     } while (create_engine_status != MaceStatus::MACE_SUCCESS);
   } else {
@@ -426,11 +443,11 @@ bool RunModel(const std::string &model_name,
   }
 
   // Metrics reporting tools depends on the format, keep in consistent
-  printf("========================================\n");
-  printf("            init      warmup     run_avg\n");
-  printf("========================================\n");
-  printf("time %11.3f %11.3f %11.3f\n",
-         init_millis, warmup_millis, model_run_millis);
+  printf("========================================================\n");
+  printf("     capability(CPU)        init      warmup     run_avg\n");
+  printf("========================================================\n");
+  printf("time %15.3f %11.3f %11.3f %11.3f\n",
+         cpu_capability, init_millis, warmup_millis, model_run_millis);
 
   for (size_t i = 0; i < output_count; ++i) {
@@ -449,10 +466,6 @@ bool RunModel(const std::string &model_name,
               << output_size << " done.";
   }
 
-  if (model_weights_data != nullptr) {
-    MemoryUnMap(model_weights_data, model_weights_data_size);
-  }
-
   return true;
 }
@@ -480,13 +493,10 @@ int Main(int argc, char **argv) {
   LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads;
   LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy;
 
-  std::vector<std::string> input_names = str_util::Split(FLAGS_input_node, ',');
-  std::vector<std::string> output_names =
-      str_util::Split(FLAGS_output_node, ',');
-  std::vector<std::string> input_shapes =
-      str_util::Split(FLAGS_input_shape, ':');
-  std::vector<std::string> output_shapes =
-      str_util::Split(FLAGS_output_shape, ':');
+  std::vector<std::string> input_names = Split(FLAGS_input_node, ',');
+  std::vector<std::string> output_names = Split(FLAGS_output_node, ',');
+  std::vector<std::string> input_shapes = Split(FLAGS_input_shape, ':');
+  std::vector<std::string> output_shapes = Split(FLAGS_output_shape, ':');
 
   const size_t input_count = input_shapes.size();
   const size_t output_count = output_shapes.size();
@@ -498,13 +508,30 @@ int Main(int argc, char **argv) {
   for (size_t i = 0; i < output_count; ++i) {
     ParseShape(output_shapes[i], &output_shape_vec[i]);
   }
+  std::vector<std::string> raw_input_data_formats =
+      Split(FLAGS_input_data_format, ',');
+  std::vector<std::string> raw_output_data_formats =
+      Split(FLAGS_output_data_format, ',');
+  std::vector<DataFormat> input_data_formats(input_count);
+  std::vector<DataFormat> output_data_formats(output_count);
+  for (size_t i = 0; i < input_count; ++i) {
+    input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]);
+  }
+  for (size_t i = 0; i < output_count; ++i) {
+    output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
+  }
+
+  // get cpu capability
+  Capability cpu_capability = GetCapability(DeviceType::CPU);
 
   bool ret = false;
   for (int i = 0; i < FLAGS_restart_round; ++i) {
     VLOG(0) << "restart round " << i;
-    ret =
-        RunModel(FLAGS_model_name, input_names, input_shape_vec,
-                 output_names, output_shape_vec);
+    ret = RunModel(FLAGS_model_name,
+                   input_names, input_shape_vec, input_data_formats,
+                   output_names, output_shape_vec, output_data_formats,
+                   cpu_capability.float32_performance.exec_time);
   }
   if (ret) {
     return 0;
diff --git a/mace/utils/BUILD b/mace/utils/BUILD.bazel
similarity index 61%
rename from mace/utils/BUILD
rename to mace/utils/BUILD.bazel
index 4388e1a6628de7f738cb2a971d9a9c8f29022bd3..378210a3905e68188a8de35d2b1a8b1dacdefd39 100644
--- a/mace/utils/BUILD
+++ b/mace/utils/BUILD.bazel
@@ -10,9 +10,27 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//mace:mace.bzl",
     "if_android",
+    "if_android_armv7",
+    "if_neon_enabled",
     "if_openmp_enabled",
 )
 
+cc_library(
+    name = "utils_hdrs",
+    hdrs = glob([
+        "*.h",
+    ]),
+    copts = [
+        "-Werror",
+        "-Wextra",
+        "-Wno-missing-field-initializers",
+    ],
+    deps = [
+        "//mace/port:port_api",
+        "//mace/public",
+    ],
+)
"-Wno-missing-field-initializers", + ], + deps = [ + "//mace/port:port_api", + "//mace/public", + ], +) + cc_library( name = "utils", srcs = glob( @@ -20,46 +38,47 @@ cc_library( "*.cc", ], exclude = [ - "tuner_test.cc", + "*_test.cc", ], ), - hdrs = glob([ - "*.h", - ]), copts = [ "-Werror", "-Wextra", "-Wno-missing-field-initializers", ] + if_openmp_enabled([ "-fopenmp", + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + "-mfloat-abi=softfp", ]), linkopts = if_android([ "-llog", ]), deps = [ - "//mace/public", + ":utils_hdrs", ], + alwayslink = 1, ) cc_test( - name = "tuner_test", + name = "utils_test", testonly = 1, - srcs = [ - "tuner_test.cc", - ], + srcs = glob( + [ + "*_test.cc", + ], + ), copts = [ "-Werror", "-Wextra", "-Wno-missing-field-initializers", ], - linkopts = ["-ldl"] + if_android([ - "-pie", - "-lm", # Required by unordered_map - ]), linkstatic = 1, deps = [ ":utils", - "//mace/core", + "//mace/port", "@gtest//:gtest", "@gtest//:gtest_main", ], diff --git a/mace/utils/conf_util.h b/mace/utils/conf_util.h new file mode 100644 index 0000000000000000000000000000000000000000..4800b15e24d47c3690531a94f52214616c710624 --- /dev/null +++ b/mace/utils/conf_util.h @@ -0,0 +1,33 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_UTILS_CONF_UTIL_H_ +#define MACE_UTILS_CONF_UTIL_H_ + +#include +#include +#include +#include +#include + +namespace mace { + +inline bool EnvConfEnabled(std::string env_name) { + char *env = getenv(env_name.c_str()); + return !(!env || env[0] == 0 || env[0] == '0'); +} + +} // namespace mace + +#endif // MACE_UTILS_CONF_UTIL_H_ diff --git a/mace/utils/detection_output.cc b/mace/utils/detection_output.cc deleted file mode 100644 index 10a4f4f0903e65d3d11ff53051b61f4a15dc1756..0000000000000000000000000000000000000000 --- a/mace/utils/detection_output.cc +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "mace/utils/logging.h"
-
-namespace mace {
-
-struct BBox {
-  float xmin;
-  float ymin;
-  float xmax;
-  float ymax;
-  int label;
-  float confidence;
-};
-
-namespace {
-inline float overlap(const BBox &a, const BBox &b) {
-  if (a.xmin > b.xmax || a.xmax < b.xmin ||
-      a.ymin > b.ymax || a.ymax < b.ymin) {
-    return 0.f;
-  }
-  float overlap_w = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin);
-  float overlap_h = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin);
-  return overlap_w * overlap_h;
-}
-
-void NmsSortedBboxes(const std::vector<BBox> &bboxes,
-                     const float nms_threshold,
-                     const int top_k,
-                     std::vector<BBox> *sorted_boxes) {
-  const int n = std::min(top_k, static_cast<int>(bboxes.size()));
-  std::vector<int> picked;
-
-  std::vector<float> areas(n);
-#pragma omp parallel for schedule(runtime)
-  for (int i = 0; i < n; ++i) {
-    const BBox &r = bboxes[i];
-    float width = std::max(0.f, r.xmax - r.xmin);
-    float height = std::max(0.f, r.ymax - r.ymin);
-    areas[i] = width * height;
-  }
-
-  for (int i = 0; i < n; ++i) {
-    const BBox &a = bboxes[i];
-    int keep = 1;
-    for (size_t j = 0; j < picked.size(); ++j) {
-      const BBox &b = bboxes[picked[j]];
-
-      float inter_area = overlap(a, b);
-      float union_area = areas[i] + areas[picked[j]] - inter_area;
-      MACE_CHECK(union_area > 0, "union_area should be greater than 0");
-      if (inter_area / union_area > nms_threshold) {
-        keep = 0;
-        break;
-      }
-    }
-
-    if (keep) {
-      picked.push_back(i);
-      sorted_boxes->push_back(bboxes[i]);
-    }
-  }
-}
-
-inline bool cmp(const BBox &a, const BBox &b) {
-  return a.confidence > b.confidence;
-}
-}  // namespace
-
-int DetectionOutput(const float *loc_ptr,
-                    const float *conf_ptr,
-                    const float *pbox_ptr,
-                    const int num_prior,
-                    const int num_classes,
-                    const float nms_threshold,
-                    const int top_k,
-                    const int keep_top_k,
-                    const float confidence_threshold,
-                    std::vector<BBox> *bbox_rects) {
-  MACE_CHECK(keep_top_k > 0, "keep_top_k should be greater than 0");
-  std::vector<float> bboxes(4 * num_prior);
-#pragma omp parallel for schedule(runtime)
-  for (int i = 0; i < num_prior; ++i) {
-    int index = i * 4;
-    const float *lc = loc_ptr + index;
-    const float *pb = pbox_ptr + index;
-    const float *var = pb + num_prior * 4;
-
-    float pb_w = pb[2] - pb[0];
-    float pb_h = pb[3] - pb[1];
-    float pb_cx = (pb[0] + pb[2]) * 0.5f;
-    float pb_cy = (pb[1] + pb[3]) * 0.5f;
-
-    float bbox_cx = var[0] * lc[0] * pb_w + pb_cx;
-    float bbox_cy = var[1] * lc[1] * pb_h + pb_cy;
-    float bbox_w = std::exp(var[2] * lc[2]) * pb_w;
-    float bbox_h = std::exp(var[3] * lc[3]) * pb_h;
-
-    bboxes[0 + index] = bbox_cx - bbox_w * 0.5f;
-    bboxes[1 + index] = bbox_cy - bbox_h * 0.5f;
-    bboxes[2 + index] = bbox_cx + bbox_w * 0.5f;
-    bboxes[3 + index] = bbox_cy + bbox_h * 0.5f;
-  }
-  // start from 1 to ignore background class
-
-  for (int i = 1; i < num_classes; ++i) {
-    // filter by confidence threshold
-    std::vector<BBox> class_bbox_rects;
-    for (int j = 0; j < num_prior; ++j) {
-      float confidence = conf_ptr[j * num_classes + i];
-      if (confidence > confidence_threshold) {
-        BBox c = {bboxes[0 + j * 4], bboxes[1 + j * 4], bboxes[2 + j * 4],
-                  bboxes[3 + j * 4], i, confidence};
-        class_bbox_rects.push_back(c);
-      }
-    }
-    std::sort(class_bbox_rects.begin(), class_bbox_rects.end(), cmp);
-
-    // apply nms
-    std::vector<BBox> sorted_boxes;
-    NmsSortedBboxes(class_bbox_rects,
-                    nms_threshold,
-                    std::min(top_k,
-                             static_cast<int>(class_bbox_rects.size())),
-                    &sorted_boxes);
-    // gather
-    bbox_rects->insert(bbox_rects->end(), sorted_boxes.begin(),
-                       sorted_boxes.end());
-  }
-
-  std::sort(bbox_rects->begin(), bbox_rects->end(), cmp);
-
-  // output
-  int num_detected = keep_top_k < static_cast<int>(bbox_rects->size()) ?
-                     keep_top_k : static_cast<int>(bbox_rects->size());
-  bbox_rects->resize(num_detected);
-
-  return num_detected;
-}
-}  // namespace mace
diff --git a/mace/utils/logging.cc b/mace/utils/logging.cc
deleted file mode 100644
index 8091f0a0148e8f1d68f7c88858585d51f232dd8a..0000000000000000000000000000000000000000
--- a/mace/utils/logging.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/utils/logging.h"
-
-#include
-#include
-#include
-#if defined(ANDROID) || defined(__ANDROID__)
-#include
-#include
-#endif
-
-namespace mace {
-namespace logging {
-
-LogMessage::LogMessage(const char *fname, int line, int severity)
-    : fname_(fname), line_(line), severity_(severity) {}
-
-void LogMessage::DealWithFatal() {
-  // When there is a fatal log, now we simply abort.
-  abort();
-}
-
-void LogMessage::GenerateLogMessage() {
-#if defined(ANDROID) || defined(__ANDROID__)
-  int android_log_level;
-  switch (severity_) {
-    case INFO:
-      android_log_level = ANDROID_LOG_INFO;
-      break;
-    case WARNING:
-      android_log_level = ANDROID_LOG_WARN;
-      break;
-    case ERROR:
-      android_log_level = ANDROID_LOG_ERROR;
-      break;
-    case FATAL:
-      android_log_level = ANDROID_LOG_FATAL;
-      break;
-    default:
-      if (severity_ < INFO) {
-        android_log_level = ANDROID_LOG_VERBOSE;
-      } else {
-        android_log_level = ANDROID_LOG_ERROR;
-      }
-      break;
-  }
-
-  std::stringstream ss;
-  const char *const partial_name = strrchr(fname_, '/');
-  ss << (partial_name != nullptr ? partial_name + 1 : fname_) << ":" << line_
-     << " " << str();
-  __android_log_write(android_log_level, "MACE", ss.str().c_str());
-
-  // Also log to stderr (for standalone Android apps).
-  std::cerr << "IWEF"[severity_] << " " << ss.str() << std::endl;
-#else
-  fprintf(stderr, "%c %s:%d] %s\n", "IWEF"[severity_], fname_, line_,
-          str().c_str());
-#endif
-
-  // When there is a fatal log, terminate execution
-  if (severity_ == FATAL) {
-    DealWithFatal();
-  }
-}
-
-namespace {
-
-int LogLevelStrToInt(const char *mace_env_var_val) {
-  if (mace_env_var_val == nullptr) {
-    return 0;
-  }
-  // Simply use atoi here. Return 0 if convert unsuccessfully.
-  return atoi(mace_env_var_val);
-}
-
-int MinLogLevelFromEnv() {
-  // Read the min log level from env once during the first call to logging.
-  static int log_level = LogLevelStrToInt(getenv("MACE_CPP_MIN_LOG_LEVEL"));
-  return log_level;
-}
-
-int MinVLogLevelFromEnv() {
-  // Read the min vlog level from env once during the first call to logging.
-  static int vlog_level = LogLevelStrToInt(getenv("MACE_CPP_MIN_VLOG_LEVEL"));
-  return vlog_level;
-}
-
-}  // namespace
-
-LogMessage::~LogMessage() {
-  int min_log_level = MinLogLevelFromEnv();
-  if (severity_ >= min_log_level) GenerateLogMessage();
-}
-
-int LogMessage::MinVLogLevel() {
-  return MinVLogLevelFromEnv();
-}
-
-}  // namespace logging
-}  // namespace mace
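
The deleted implementation read both thresholds from the environment, and a VLOG(vll) message was produced only when vll <= MACE_CPP_MIN_VLOG_LEVEL; the replacement port::Logger keeps this rule (see VLOG_IS_ON below). A Python mirror of the rule, for illustration only:

    import os

    def vlog_is_on(vll, env=os.environ):
        try:
            threshold = int(env.get("MACE_CPP_MIN_VLOG_LEVEL", "0"))
        except ValueError:
            threshold = 0  # atoi-style fallback, matching the C++ code
        return vll <= threshold

    assert vlog_is_on(2, {"MACE_CPP_MIN_VLOG_LEVEL": "2"})
    assert not vlog_is_on(3, {"MACE_CPP_MIN_VLOG_LEVEL": "2"})
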
diff --git a/mace/utils/logging.h b/mace/utils/logging.h
index 63d372d88b5c1241e34f8c92c7ff9b7c41d6a33e..8a5f2f8e025f1ad350a9503243dd66ad9628691f 100644
--- a/mace/utils/logging.h
+++ b/mace/utils/logging.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -21,45 +21,19 @@
 #include
 #include
 
-#include "mace/public/mace.h"
-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
+#include "mace/port/logger.h"
+#include "mace/utils/macros.h"
 #include "mace/utils/string_util.h"
-#include "mace/utils/utils.h"
 
-#undef ERROR
 
 namespace mace {
-
-// Log severity level constants.
-const int INFO = 0;
-const int WARNING = 1;
-const int ERROR = 2;
-const int FATAL = 3;
-
-namespace logging {
-
-class LogMessage : public std::ostringstream {
- public:
-  LogMessage(const char *fname, int line, int severity);
-  ~LogMessage();
-
-  static int MinVLogLevel();
-
- private:
-  void GenerateLogMessage();
-  void DealWithFatal();
-
-  const char *fname_;
-  int line_;
-  int severity_;
-};
+namespace logging_internal {
 
 #define LOG(severity) \
-  ::mace::logging::LogMessage(__FILE__, __LINE__, mace::severity)
+  ::mace::port::Logger(__FILE__, __LINE__, mace::severity)
 
-// Set MACE_CPP_MIN_VLOG_LEVEL environment to update minimum log level of VLOG.
-// Only when vlog_level <= MinVLogLevel(), it will produce output.
-#define VLOG_IS_ON(vll) ((vll) <= ::mace::logging::LogMessage::MinVLogLevel())
+#define VLOG_IS_ON(vll) (mace::ShouldGenerateVLogMessage(vll))
 #define VLOG(vll) if (VLOG_IS_ON(vll)) LOG(INFO)
 
 // MACE_CHECK/MACE_ASSERT dies with a fatal error if condition is not true.
@@ -85,17 +59,27 @@ class LogMessage : public std::ostringstream {
 template <typename T>
 T &&CheckNotNull(const char *file, int line, const char *exprtext, T &&t) {
   if (t == nullptr) {
-    ::mace::logging::LogMessage(file, line, FATAL) << std::string(exprtext);
+    ::mace::port::Logger(file, line, FATAL) << std::string(exprtext);
   }
   return std::forward<T>(t);
 }
 
 #define MACE_CHECK_NOTNULL(val) \
-  ::mace::logging::CheckNotNull(__FILE__, __LINE__, \
-                                "'" #val "' Must not be NULL", (val))
+  ::mace::logging_internal::CheckNotNull(__FILE__, __LINE__, \
+                                         "'" #val "' Must not be NULL", (val))
 
 #define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented")
 
+#define MACE_RETURN_IF_ERROR(stmt)                        \
+  {                                                       \
+    MaceStatus status = (stmt);                           \
+    if (status != MaceStatus::MACE_SUCCESS) {             \
+      VLOG(0) << #stmt << " failed with error: "          \
+              << status.information();                    \
+      return status;                                      \
+    }                                                     \
+  }
+
 class LatencyLogger {
  public:
   LatencyLogger(int vlog_level, const std::string &message)
@@ -121,11 +105,21 @@ class LatencyLogger {
   MACE_DISABLE_COPY_AND_ASSIGN(LatencyLogger);
 };
 
-#define MACE_LATENCY_LOGGER(vlog_level, ...) \
-  mace::logging::LatencyLogger latency_logger_##__line__( \
+#define MACE_LATENCY_LOGGER(vlog_level, ...) \
+  mace::logging_internal::LatencyLogger latency_logger_##__line__( \
       vlog_level, VLOG_IS_ON(vlog_level) ? mace::MakeString(__VA_ARGS__) : "")
mace::MakeString(__VA_ARGS__) : "") -} // namespace logging + +#ifdef MACE_ENABLE_MALLOC_LOGGING +#define MACE_MEMORY_LOGGING_GUARD() \ + auto malloc_logger_##__line__ = port::Env::Default()->NewMallocLogger( \ + ::mace::port::Logger(__FILE__, __LINE__, mace::INFO), \ + std::string(__FILE__) + ":" + std::string(__func__)); +#else +#define MACE_MEMORY_LOGGING_GUARD() +#endif + +} // namespace logging_internal } // namespace mace #endif // MACE_UTILS_LOGGING_H_ diff --git a/mace/utils/logging_test.cc b/mace/utils/logging_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a33cc96161c09f2daddac05411e3b6c269d2d5b --- /dev/null +++ b/mace/utils/logging_test.cc @@ -0,0 +1,41 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/utils/logging.h" + +#include + +namespace mace { +namespace { + +class LoggingTest : public ::testing::Test { +}; + +TEST_F(LoggingTest, Basic) { + LOG(INFO) << "info logging"; + LOG(WARNING) << "warning logging"; + LOG(ERROR) << "error logging"; + + VLOG(1) << "vlog 1 logging"; + VLOG(2) << "vlog 2 logging"; +} + +TEST_F(LoggingTest, LogFatal) { +#ifdef GTEST_HAS_DEATH_TEST + EXPECT_DEATH(do { LOG(FATAL) << "fatal logging"; } while (false), ""); +#endif +} + +} // namespace +} // namespace mace diff --git a/mace/core/macros.h b/mace/utils/macros.h similarity index 61% rename from mace/core/macros.h rename to mace/utils/macros.h index e90049f4764ea07654ed810e8086230dc2fc9b5b..1ce38183018b5ddb9b64a3756126cfd6426c4f68 100644 --- a/mace/core/macros.h +++ b/mace/utils/macros.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_CORE_MACROS_H_ -#define MACE_CORE_MACROS_H_ +#ifndef MACE_UTILS_MACROS_H_ +#define MACE_UTILS_MACROS_H_ + +namespace mace { + +// Disable the copy and assignment operator for a class. +#ifndef MACE_DISABLE_COPY_AND_ASSIGN +#define MACE_DISABLE_COPY_AND_ASSIGN(CLASSNAME) \ + CLASSNAME(const CLASSNAME &) = delete; \ + CLASSNAME &operator=(const CLASSNAME &) = delete; +#endif + +#ifndef MACE_EMPTY_VIRTUAL_DESTRUCTOR +#define MACE_EMPTY_VIRTUAL_DESTRUCTOR(CLASSNAME) \ + public: \ + virtual ~CLASSNAME() {} +#endif + +#define MACE_UNUSED(var) (void)(var) + +#define MACE_COMPUTE_KERNEL_SOURCE(...) #__VA_ARGS__ // GCC can be told that a certain branch is not likely to be taken (for // instance, a CHECK failure), and use that information in static analysis. 
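// Editor's aside (illustrative sketch, not part of the patch): the new
// MACE_RETURN_IF_ERROR macro added to mace/utils/logging.h above is meant to
// be used as below; InitDevice() is a hypothetical function returning
// MaceStatus:
//
//   MaceStatus Setup() {
//     MACE_RETURN_IF_ERROR(InitDevice());  // logs "InitDevice()" via VLOG(0)
//                                          // and returns early on failure
//     return MaceStatus::MACE_SUCCESS;
//   }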
@@ -27,6 +46,6 @@
 #define MACE_PREDICT_TRUE(x) (x)
 #endif

-#define MACE_UNUSED(var) (void)(var)
+}  // namespace mace

-#endif  // MACE_CORE_MACROS_H_
+#endif  // MACE_UTILS_MACROS_H_
diff --git a/mace/utils/utils.h b/mace/utils/math.h
similarity index 50%
rename from mace/utils/utils.h
rename to mace/utils/math.h
index 0b1a6992c0d6240e62516379b34eda2a313cf74f..0293806c66667d55439b6802e1a8ec3943c1635e 100644
--- a/mace/utils/utils.h
+++ b/mace/utils/math.h
@@ -12,29 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_UTILS_UTILS_H_
-#define MACE_UTILS_UTILS_H_
+#ifndef MACE_UTILS_MATH_H_
+#define MACE_UTILS_MATH_H_

-#include <cstdlib>
-#include <map>
-#include <string>
-#include <vector>
+#include <cmath>

-namespace mace {
+#include <algorithm>
+#include <vector>

-// Disable the copy and assignment operator for a class.
-#ifndef MACE_DISABLE_COPY_AND_ASSIGN
-#define MACE_DISABLE_COPY_AND_ASSIGN(CLASSNAME) \
-  private:                                      \
-  CLASSNAME(const CLASSNAME &) = delete;        \
-  CLASSNAME &operator=(const CLASSNAME &) = delete
-#endif
+#include "mace/utils/logging.h"

-#ifndef MACE_EMPTY_VIRTUAL_DESTRUCTOR
-#define MACE_EMPTY_VIRTUAL_DESTRUCTOR(CLASSNAME) \
-  public:                                        \
-  virtual ~CLASSNAME() {}
-#endif
+namespace mace {

 template <typename Integer>
 Integer RoundUp(Integer i, Integer factor) {
@@ -67,51 +55,38 @@ Integer CeilQuotient(Integer a, Integer b) {
   return (a + b - 1) / b;
 }

-std::string ObfuscateString(const std::string &src,
-                            const std::string &lookup_table);
-
-std::string ObfuscateString(const std::string &src);
-
-std::string ObfuscateSymbol(const std::string &src);
-
-#ifdef MACE_OBFUSCATE_LITERALS
-#define MACE_OBFUSCATE_STRING(str) ObfuscateString(str)
-#define MACE_OBFUSCATE_SYMBOL(str) ObfuscateSymbol(str)
-#else
-#define MACE_OBFUSCATE_STRING(str) (str)
-#define MACE_OBFUSCATE_SYMBOL(str) (str)
-#endif
-
-std::vector<std::string> Split(const std::string &str, char delims);
-
-bool ReadBinaryFile(std::vector<unsigned char> *data,
-                    const std::string &filename);
-
-void MemoryMap(const std::string &file,
-               const unsigned char **data,
-               size_t *size);
-
-void MemoryUnMap(const unsigned char *data,
-                 const size_t &size);
+template <typename Integer>
+inline Integer Clamp(Integer in, Integer low, Integer high) {
+  return std::max(low, std::min(in, high));
+}

 template <typename T>
-std::vector<std::string> MapKeys(const std::map<std::string, T> &data) {
-  std::vector<std::string> keys;
-  for (auto &kv : data) {
-    keys.push_back(kv.first);
+inline T ScalarSigmoid(T in) {
+  if (in > static_cast<T>(0)) {
+    return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
+  } else {
+    T x = std::exp(in);
+    return x / (x + static_cast<T>(1));
   }
-  return keys;
 }

-inline bool EnvEnabled(std::string env_name) {
-  char *env = getenv(env_name.c_str());
-  return !(!env || env[0] == 0 || env[0] == '0');
+template <typename T>
+inline T ScalarTanh(T in) {
+  if (in > static_cast<T>(0)) {
+    T inv_expa = std::exp(-in);
+    return -static_cast<T>(1) +
+        static_cast<T>(2) / (static_cast<T>(1) + inv_expa * inv_expa);
+  } else {
+    // tanh(x) = (e^{2x} - 1) / (e^{2x} + 1); with x = e^{in} this is
+    // (x * x - 1) / (x * x + 1), which is stable for in <= 0.
+    T x = std::exp(in);
+    return (x * x - static_cast<T>(1)) / (x * x + static_cast<T>(1));
+  }
 }

 template <typename SrcType, typename DstType>
 std::vector<DstType> TransposeShape(const std::vector<SrcType> &shape,
                                     const std::vector<int> &dst_dims) {
   size_t shape_dims = shape.size();
+  MACE_CHECK(shape_dims == dst_dims.size());
   std::vector<DstType> output_shape(shape_dims);
   for (size_t i = 0; i < shape_dims; ++i) {
     output_shape[i] = static_cast<DstType>(shape[dst_dims[i]]);
@@ -120,4 +95,5 @@ std::vector<DstType> TransposeShape(const std::vector<SrcType> &shape,
 }

 }  // namespace mace
-#endif  // MACE_UTILS_UTILS_H_
+
+#endif  // MACE_UTILS_MATH_H_
diff --git a/mace/utils/memory.h b/mace/utils/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..41a898ef48fd712ce65191f967565531a4afdd89
--- /dev/null
+++ b/mace/utils/memory.h
@@ -0,0 +1,74 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_UTILS_MEMORY_H_
+#define MACE_UTILS_MEMORY_H_
+
+#include <memory>
+#include <utility>
+
+namespace mace {
+
+namespace memory_internal {
+
+// Traits to select proper overload and return type for `make_unique<>`.
+template <typename T>
+struct MakeUniqueResult {
+  using scalar = std::unique_ptr<T>;
+};
+template <typename T>
+struct MakeUniqueResult<T[]> {
+  using array = std::unique_ptr<T[]>;
+};
+template <typename T, size_t N>
+struct MakeUniqueResult<T[N]> {
+  using invalid = void;
+};
+
+}  // namespace memory_internal
+
+// gcc 4.8 has __cplusplus at 201301 but doesn't define make_unique.  Other
+// supported compilers either just define __cplusplus as 201103 but have
+// make_unique (msvc), or have make_unique whenever __cplusplus > 201103 (clang)
+#if (__cplusplus > 201103L || defined(_MSC_VER)) && \
+    !(defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8)
+using std::make_unique;
+#else
+
+// `make_unique` overload for non-array types.
+template <typename T, typename... Args>
+typename memory_internal::MakeUniqueResult<T>::scalar make_unique(
+    Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+// `make_unique` overload for an array T[] of unknown bounds.
+// The array allocation needs to use the `new T[size]` form and cannot take
+// element constructor arguments. The `std::unique_ptr` will manage destructing
+// these array elements.
+template <typename T>
+typename memory_internal::MakeUniqueResult<T>::array make_unique(size_t n) {
+  return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
+}
+
+// `make_unique` overload for an array T[N] of known bounds.
+// This construction will be rejected.
+template <typename T, typename... Args>
+typename memory_internal::MakeUniqueResult<T>::invalid make_unique(
+    Args&&... /* args */) = delete;
+#endif
+
+}  // namespace mace
+
+#endif  // MACE_UTILS_MEMORY_H_
diff --git a/mace/utils/memory_logging.h b/mace/utils/memory_logging.h
deleted file mode 100644
index 4e3cd5883b749b8f1d49d5f8d6ec886d8f65a78b..0000000000000000000000000000000000000000
--- a/mace/utils/memory_logging.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
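// Editor's aside (illustrative sketch, not part of the patch): usage of the
// make_unique shim from the new mace/utils/memory.h above; Foo is a
// hypothetical type:
//
//   std::unique_ptr<Foo> foo = mace::make_unique<Foo>(1, 2);
//   std::unique_ptr<int[]> buf = mace::make_unique<int[]>(64);  // value-initialized
//   // mace::make_unique<int[8]>() is deleted and fails to compile.
//
// On toolchains that already ship std::make_unique, the shim simply
// re-exports the standard version.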
- -#ifndef MACE_UTILS_MEMORY_LOGGING_H_ -#define MACE_UTILS_MEMORY_LOGGING_H_ - -#ifndef __hexagon__ -#include -#endif -#include - -#include "mace/utils/logging.h" - -namespace mace { - -#ifdef MACE_ENABLE_MEMORY_LOGGING -class MallinfoChangeLogger { - public: - explicit MallinfoChangeLogger(const std::string &name) : name_(name) { - prev_ = mallinfo(); - } - ~MallinfoChangeLogger() { - struct mallinfo curr = mallinfo(); - LogMallinfoChange(name_, curr, prev_); - } - - private: - const std::string name_; - struct mallinfo prev_; - - struct mallinfo LogMallinfoChange(const std::string &name, - const struct mallinfo curr, - const struct mallinfo prev) { - if (prev.arena != curr.arena) { - LOG(INFO) << "[" << name << "] " - << "Non-mmapped space allocated (bytes): " << curr.arena - << ", diff: " << ((int64_t)curr.arena - (int64_t)prev.arena); - } - if (prev.ordblks != curr.ordblks) { - LOG(INFO) << "[" << name << "] " - << "Number of free chunks: " << curr.ordblks << ", diff: " - << ((int64_t)curr.ordblks - (int64_t)prev.ordblks); - } - if (prev.smblks != curr.smblks) { - LOG(INFO) << "[" << name << "] " - << "Number of free fastbin blocks: " << curr.smblks - << ", diff: " << ((int64_t)curr.smblks - (int64_t)prev.smblks); - } - if (prev.hblks != curr.hblks) { - LOG(INFO) << "[" << name << "] " - << "Number of mmapped regions: " << curr.hblks - << ", diff: " << ((int64_t)curr.hblks - (int64_t)prev.hblks); - } - if (prev.hblkhd != curr.hblkhd) { - LOG(INFO) << "[" << name << "] " - << "Space allocated in mmapped regions (bytes): " << curr.hblkhd - << ", diff: " << ((int64_t)curr.hblkhd - (int64_t)prev.hblkhd); - } - if (prev.usmblks != curr.usmblks) { - LOG(INFO) << "[" << name << "] " - << "Maximum total allocated space (bytes): " << curr.usmblks - << ", diff: " - << ((int64_t)curr.usmblks - (int64_t)prev.usmblks); - } - if (prev.fsmblks != curr.fsmblks) { - LOG(INFO) << "[" << name << "] " - << "Space in freed fastbin blocks (bytes): " << curr.fsmblks - << ", diff: " - << ((int64_t)curr.fsmblks - (int64_t)prev.fsmblks); - } - if (prev.uordblks != curr.uordblks) { - LOG(INFO) << "[" << name << "] " - << "Total allocated space (bytes): " << curr.uordblks - << ", diff: " - << ((int64_t)curr.uordblks - (int64_t)prev.uordblks); - } - if (prev.fordblks != curr.fordblks) { - LOG(INFO) << "[" << name << "] " - << "Total free space (bytes): " << curr.fordblks << ", diff: " - << ((int64_t)curr.fordblks - (int64_t)prev.fordblks); - } - if (prev.keepcost != curr.keepcost) { - LOG(INFO) << "[" << name << "] " - << "Top-most, releasable space (bytes): " << curr.keepcost - << ", diff: " - << ((int64_t)curr.keepcost - (int64_t)prev.keepcost); - } - return curr; - } -}; - -#define MACE_MEMORY_LOGGING_GUARD() \ - MallinfoChangeLogger mem_logger_##__line__(std::string(__FILE__) + ":" + \ - std::string(__func__)); -#else -#define MACE_MEMORY_LOGGING_GUARD() -#endif - -} // namespace mace - -#endif // MACE_UTILS_MEMORY_LOGGING_H_ diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h index 81d820cbfc39b2fe9edb729071d351c1993b1b01..7634833cc1e75763d79901f68b47f46705fa97db 100644 --- a/mace/utils/quantize.h +++ b/mace/utils/quantize.h @@ -19,6 +19,12 @@ #include #include +#if defined(MACE_ENABLE_NEON) +#include +#endif // MACE_ENABLE_NEON + +#include "mace/utils/logging.h" + namespace mace { template @@ -123,6 +129,25 @@ inline void Quantize(const float *input, QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output); } +template +inline void Quantize(const Tensor &input, + Tensor *output, + 
float *min_out, + float *max_out) { + MACE_CHECK(input.size() != 0); + Tensor::MappingGuard input_guard(&input); + Tensor::MappingGuard output_guard(output); + auto *input_data = input.data(); + auto *output_data = output->mutable_data(); + float scale; + int32_t zero_point; + + Quantize(input_data, input.size(), false, output_data, &scale, &zero_point); + + *min_out = scale * (std::numeric_limits::lowest() - zero_point); + *max_out = scale * (std::numeric_limits::max() - zero_point); +} + template inline void Dequantize(const T *input, const index_t size, @@ -135,14 +160,127 @@ inline void Dequantize(const T *input, } } -inline void QuantizeMultiplier(double multiplier, - int32_t* output_multiplier, - int32_t* shift) { - if (multiplier == 0.f) { - *output_multiplier = 0; - *shift = 0; - return; +#if defined(MACE_ENABLE_NEON) +template<> +inline void QuantizeWithScaleAndZeropoint(const float *input, + const index_t size, + float scale, + int32_t zero_point, + uint8_t *output) { + const float32x4_t vround = vdupq_n_f32(0.5); + const float32x4_t + vzero = vaddq_f32(vround, vcvtq_f32_s32(vdupq_n_s32(zero_point))); + const float recip_scale = 1.f / scale; + const float32x4_t vrecip_scale = vdupq_n_f32(recip_scale); + const index_t block_count = size / 16; + +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i < block_count; ++i) { + float32x4_t vi0 = vld1q_f32(input + i * 16); + float32x4_t vi1 = vld1q_f32(input + i * 16 + 4); + float32x4_t vi2 = vld1q_f32(input + i * 16 + 8); + float32x4_t vi3 = vld1q_f32(input + i * 16 + 12); + + int32x4_t vo0_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi0, vrecip_scale)); + int32x4_t vo1_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi1, vrecip_scale)); + int32x4_t vo2_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi2, vrecip_scale)); + int32x4_t vo3_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi3, vrecip_scale)); + + uint8x8_t vo0_u8 = + vqmovun_s16(vcombine_s16(vqmovn_s32(vo0_s32), vqmovn_s32(vo1_s32))); + uint8x8_t vo1_u8 = + vqmovun_s16(vcombine_s16(vqmovn_s32(vo2_s32), vqmovn_s32(vo3_s32))); + uint8x16_t vo = vcombine_u8(vo0_u8, vo1_u8); + + vst1q_u8(output + i * 16, vo); + } + +#pragma omp parallel for schedule(runtime) + for (index_t i = block_count * 16; i < size; ++i) { + output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); + } +} + +template<> +inline void Dequantize(const int32_t *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { + const index_t block_count = size / 4; + const int32x4_t vzero = vdupq_n_s32(zero_point); + const float32x4_t vscale = vdupq_n_f32(scale); + +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i < block_count; ++i) { + int32x4_t vi = vld1q_s32(input + i * 4); + float32x4_t vo = vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vi, vzero))); + vst1q_f32(output + i * 4, vo); + } + for (index_t i = block_count * 4; i < size; ++i) { + output[i] = scale * (input[i] - zero_point); + } +} + +template<> +inline void Dequantize(const uint8_t *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { + const index_t block_count = size / 16; + const int32x4_t vzero = vdupq_n_s32(zero_point); + const float32x4_t vscale = vdupq_n_f32(scale); + +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i < block_count; ++i) { + uint8x16_t vi = vld1q_u8(input + i * 16); + float32x4x4_t vo = { + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), + 
vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_high_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_low_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), + }; + vst1q_f32(output + i * 16, vo.val[0]); + vst1q_f32(output + i * 16 + 4, vo.val[1]); + vst1q_f32(output + i * 16 + 8, vo.val[2]); + vst1q_f32(output + i * 16 + 12, vo.val[3]); } + for (index_t i = block_count * 16; i < size; ++i) { + output[i] = scale * (input[i] - zero_point); + } +} +#endif // MACE_ENABLE_NEON + +template +inline void DeQuantize(const Tensor &input, + const float min_in, + const float max_in, + Tensor *output) { + MACE_CHECK(input.size() != 0); + Tensor::MappingGuard input_guard(&input); + Tensor::MappingGuard output_guard(output); + auto *input_data = input.data(); + auto *output_data = output->mutable_data(); + float scale; + int32_t zero_point; + + AdjustRange(min_in, max_in, false, &scale, &zero_point); + + Dequantize(input_data, input.size(), scale, zero_point, output_data); +} + +inline void QuantizeMultiplier(double multiplier, + int32_t *output_multiplier, + int32_t *shift) { const double q = std::frexp(multiplier, shift); auto qint = static_cast(roundl(q * (1ll << 31))); if (qint == (1ll << 31)) { diff --git a/mace/utils/rwlock.h b/mace/utils/rwlock.h index c15fa5ad7a605ce1dc0d7b2fabb083a4aba53e7f..b4d6392ce3772fac468b46f450faa89839c8e5f6 100644 --- a/mace/utils/rwlock.h +++ b/mace/utils/rwlock.h @@ -17,7 +17,9 @@ #include // NOLINT(build/c++11) #include // NOLINT(build/c++11) + #include "mace/utils/logging.h" +#include "mace/utils/macros.h" namespace mace { namespace utils { @@ -26,10 +28,6 @@ class RWMutex { public: RWMutex() : counter_(0), waiting_readers_(0), waiting_writers_(0) {} ~RWMutex() = default; - RWMutex(const RWMutex &) = delete; - RWMutex(RWMutex &&) = delete; - RWMutex& operator=(const RWMutex &) = delete; - RWMutex& operator=(RWMutex &&) = delete; int counter_; // -1 for writer, 0 for nobody, 1~n for reader int waiting_readers_; @@ -37,6 +35,8 @@ class RWMutex { std::mutex mutex_; std::condition_variable reader_cv_; std::condition_variable writer_cv_; + + MACE_DISABLE_COPY_AND_ASSIGN(RWMutex); }; // Writer first @@ -61,13 +61,11 @@ class ReadLock { } } } - ReadLock(const ReadLock &) = delete; - ReadLock(ReadLock &&) = delete; - ReadLock& operator=(const ReadLock &) = delete; - ReadLock& operator=(ReadLock &&) = delete; private: RWMutex *rw_mutex_; + + MACE_DISABLE_COPY_AND_ASSIGN(ReadLock); }; class WriteLock { @@ -91,13 +89,11 @@ class WriteLock { rw_mutex_->reader_cv_.notify_all(); } } - WriteLock(const WriteLock &) = delete; - WriteLock(WriteLock &&) = delete; - WriteLock& operator=(const WriteLock &) = delete; - WriteLock& operator=(WriteLock &&) = delete; private: RWMutex *rw_mutex_; + + MACE_DISABLE_COPY_AND_ASSIGN(WriteLock); }; } // namespace utils diff --git a/mace/utils/stl_util.h b/mace/utils/stl_util.h new file mode 100644 index 0000000000000000000000000000000000000000..44dd1d8e384b7bfa260e12b9f33183a5ec5b7157 --- /dev/null +++ b/mace/utils/stl_util.h @@ -0,0 +1,37 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_UTILS_STL_UTIL_H_
+#define MACE_UTILS_STL_UTIL_H_
+
+#include
+#include
+#include
+#include
+#include
+
+namespace mace {
+
+template <typename T>
+std::vector<std::string> MapKeys(const std::map<std::string, T> &data) {
+  std::vector<std::string> keys;
+  for (auto &kv : data) {
+    keys.push_back(kv.first);
+  }
+  return keys;
+}
+
+}  // namespace mace
+
+#endif  // MACE_UTILS_STL_UTIL_H_
diff --git a/mace/utils/string_util.cc b/mace/utils/string_util.cc
index 3492706fe068f3caef8ce9443b505f887fb97ab6..8114e3aad7364c20a2d14b75912d1d798df24263 100644
--- a/mace/utils/string_util.cc
+++ b/mace/utils/string_util.cc
@@ -83,4 +83,65 @@ std::string StringFormatter::Table(
 }

 }  // namespace string_util
+
+std::string ObfuscateString(const std::string &src,
+                            const std::string &lookup_table) {
+  std::string dest;
+  dest.resize(src.size());
+  for (size_t i = 0; i < src.size(); i++) {
+    dest[i] = src[i] ^ lookup_table[i % lookup_table.size()];
+  }
+  return dest;
+}
+
+// ObfuscateString(ObfuscateString(str)) ==> str
+std::string ObfuscateString(const std::string &src) {
+  // Keep consistent with obfuscation in python tools
+  return ObfuscateString(src, "Mobile-AI-Compute-Engine");
+}
+
+// Obfuscate symbol or path string
+std::string ObfuscateSymbol(const std::string &src) {
+  std::string dest = src;
+  if (dest.empty()) {
+    return dest;
+  }
+  dest[0] = src[0];  // avoid invalid symbol which starts from 0-9
+  const std::string encode_dict =
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
+  for (size_t i = 1; i < src.size(); i++) {
+    char ch = src[i];
+    int idx;
+    if (ch >= '0' && ch <= '9') {
+      idx = ch - '0';
+    } else if (ch >= 'a' && ch <= 'z') {
+      idx = 10 + ch - 'a';
+    } else if (ch >= 'A' && ch <= 'Z') {
+      idx = 10 + 26 + ch - 'a';
+    } else if (ch == '_') {
+      idx = 10 + 26 + 26;
+    } else {
+      dest[i] = ch;
+      continue;
+    }
+    // There is no collision if it's true for every char at every position
+    dest[i] = encode_dict[(idx + i + 31) % encode_dict.size()];
+  }
+  return dest;
+}
+
+std::vector<std::string> Split(const std::string &str, char delims) {
+  std::vector<std::string> result;
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    size_t next_offset = tmp.find(delims);
+    result.push_back(tmp.substr(0, next_offset));
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+  return result;
+}
 }  // namespace mace
diff --git a/mace/utils/string_util.h b/mace/utils/string_util.h
index c41aaaa12fdb682bc7aea2d08076b867ad8615f0..c9df13566335b4041aca30d6a6f4e911434bb0d4 100644
--- a/mace/utils/string_util.h
+++ b/mace/utils/string_util.h
@@ -15,6 +15,7 @@
 #ifndef MACE_UTILS_STRING_UTIL_H_
 #define MACE_UTILS_STRING_UTIL_H_

+#include <algorithm>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -80,6 +81,35 @@ inline std::string MakeString(const std::string &str) {
 inline std::string MakeString(const char *c_str) { return std::string(c_str); }

+inline std::string ToLower(const std::string &src) {
+  std::string dest(src);
+  std::transform(src.begin(), src.end(), dest.begin(), ::tolower);
+  return dest;
+}
+
+inline std::string ToUpper(const std::string &src) {
+  std::string dest(src);
+  std::transform(src.begin(), src.end(), dest.begin(), ::toupper);
+  return dest;
+}
+
+std::string ObfuscateString(const std::string &src,
+                            const std::string &lookup_table);
+
+std::string ObfuscateString(const std::string &src);
+
+std::string ObfuscateSymbol(const std::string &src);
+
+#ifdef MACE_OBFUSCATE_LITERALS
+#define MACE_OBFUSCATE_STRING(str) ObfuscateString(str)
+#define MACE_OBFUSCATE_SYMBOL(str) ObfuscateSymbol(str)
+#else
+#define MACE_OBFUSCATE_STRING(str) (str)
+#define MACE_OBFUSCATE_SYMBOL(str) (str)
+#endif
+
+std::vector<std::string> Split(const std::string &str, char delims);
+
 }  // namespace mace

 #endif  // MACE_UTILS_STRING_UTIL_H_
diff --git a/mace/utils/string_util_test.cc b/mace/utils/string_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..84b9b0a671a5655d3f9660e859d5ff0ed56b9f3a
--- /dev/null
+++ b/mace/utils/string_util_test.cc
@@ -0,0 +1,40 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/utils/string_util.h"
+
+#include <gtest/gtest.h>
+
+namespace mace {
+namespace {
+
+class StringUtilTest : public ::testing::Test {
+};
+
+TEST_F(StringUtilTest, MakeString) {
+  EXPECT_EQ("Hello 2019", MakeString("Hello", " ", 2019));
+}
+
+TEST_F(StringUtilTest, ToLower) {
+  EXPECT_EQ("", ToLower(""));
+  EXPECT_EQ("hello world!", ToLower("Hello World!"));
+}
+
+TEST_F(StringUtilTest, ToUpper) {
+  EXPECT_EQ("", ToUpper(""));
+  EXPECT_EQ("HELLO WORLD!", ToUpper("Hello World!"));
+}
+
+}  // namespace
+}  // namespace mace
diff --git a/mace/utils/timer.h b/mace/utils/timer.h
index 3f0e96f4c37045ecd7c9b9a274a6fbf7dc0a0380..0955af7ba5ce5db65e4493b486e028683f5d1e66 100644
--- a/mace/utils/timer.h
+++ b/mace/utils/timer.h
@@ -15,7 +15,7 @@
 #ifndef MACE_UTILS_TIMER_H_
 #define MACE_UTILS_TIMER_H_

-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
 #include "mace/utils/logging.h"

 namespace mace {
diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h
index 7ac8467bce6cf4df2c3c6c4741cf6b630497074d..5d381b048a68ee9c728b656e8efdcd72d6971d5a 100644
--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -14,12 +14,14 @@
 #ifndef MACE_UTILS_TUNER_H_
 #define MACE_UTILS_TUNER_H_
+
+// TODO(heliangliang) Fix portability
 #include
-#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -30,8 +32,8 @@
 #include

 #include "mace/utils/logging.h"
+#include "mace/utils/string_util.h"
 #include "mace/utils/timer.h"
-#include "mace/utils/utils.h"

 namespace mace {

@@ -76,16 +78,14 @@ class Tuner {
       std::vector<param_type> opt_param = default_param;
       RetType res = Tune(param_generator, func, timer, &opt_param);
       VLOG(3) << "Tuning " << param_key
-              << " retult: " << (VLOG_IS_ON(3) ? MakeString(opt_param) : "");
+              << " result: " << MakeString(opt_param);
       param_table_[obfucated_param_key] = opt_param;
       return res;
     } else {
       // run
       if (param_table_.find(obfucated_param_key) != param_table_.end()) {
         VLOG(3) << param_key << ": "
-                << (VLOG_IS_ON(3)
-                    ? MakeString(param_table_[obfucated_param_key])
-                    : "");
+                << MakeString(param_table_[obfucated_param_key]);
         return func(param_table_[obfucated_param_key], nullptr, nullptr);
       } else {
         return func(default_param, nullptr, nullptr);
@@ -112,7 +112,7 @@
                 sizeof(params_size));
       VLOG(3) << "Write tuning param: " << kp.first.c_str() << ": "
-              << (VLOG_IS_ON(3) ? MakeString(params) : "");
+              << MakeString(params);
       for (auto &param : params) {
         ofs.write(reinterpret_cast<char *>(&param), sizeof(params_size));
       }
@@ -293,4 +293,5 @@
 };

 }  // namespace mace
+
 #endif  // MACE_UTILS_TUNER_H_
diff --git a/mace/utils/utils.cc b/mace/utils/utils.cc
deleted file mode 100644
index a422988d1689353a720e19ca544859dd5f952a68..0000000000000000000000000000000000000000
--- a/mace/utils/utils.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/utils/utils.h"
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "mace/utils/logging.h"
-
-namespace mace {
-
-std::string ObfuscateString(const std::string &src,
-                            const std::string &lookup_table) {
-  std::string dest;
-  dest.resize(src.size());
-  for (size_t i = 0; i < src.size(); i++) {
-    dest[i] = src[i] ^ lookup_table[i % lookup_table.size()];
-  }
-  return dest;
-}
-
-// ObfuscateString(ObfuscateString(str)) ==> str
-std::string ObfuscateString(const std::string &src) {
-  // Keep consistent with obfuscation in python tools
-  return ObfuscateString(src, "Mobile-AI-Compute-Engine");
-}
-
-// Obfuscate synbol or path string
-std::string ObfuscateSymbol(const std::string &src) {
-  std::string dest = src;
-  if (dest.empty()) {
-    return dest;
-  }
-  dest[0] = src[0];  // avoid invalid symbol which starts from 0-9
-  const std::string encode_dict =
-      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
-  for (size_t i = 1; i < src.size(); i++) {
-    char ch = src[i];
-    int idx;
-    if (ch >= '0' && ch <= '9') {
-      idx = ch - '0';
-    } else if (ch >= 'a' && ch <= 'z') {
-      idx = 10 + ch - 'a';
-    } else if (ch >= 'A' && ch <= 'Z') {
-      idx = 10 + 26 + ch - 'a';
-    } else if (ch == '_') {
-      idx = 10 + 26 + 26;
-    } else {
-      dest[i] = ch;
-      continue;
-    }
-    // There is no collision if it's true for every char at every position
-    dest[i] = encode_dict[(idx + i + 31) % encode_dict.size()];
-  }
-  return dest;
-}
-
-std::vector<std::string> Split(const std::string &str, char delims) {
-  std::vector<std::string> result;
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    size_t next_offset = tmp.find(delims);
-    result.push_back(tmp.substr(0, next_offset));
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-  return result;
-}
-
-bool ReadBinaryFile(std::vector<unsigned char> *data,
-                    const std::string &filename) {
-  std::ifstream ifs(filename, std::ios::in | std::ios::binary);
-  if (!ifs.is_open()) {
-    return false;
-  }
-  ifs.seekg(0, ifs.end);
-  size_t length =
      ifs.tellg();
-  ifs.seekg(0, ifs.beg);
-
-  data->resize(length);
-  ifs.read(reinterpret_cast<char *>(data->data()), length);
-
-  if (ifs.fail()) {
-    return false;
-  }
-  ifs.close();
-
-  return true;
-}
-
-void MemoryMap(const std::string &file,
-               const unsigned char **data,
-               size_t *size) {
-  int fd = open(file.c_str(), O_RDONLY);
-  MACE_CHECK(fd >= 0,
-             "Failed to open file ", file, ", error code: ", strerror(errno));
-  struct stat st;
-  fstat(fd, &st);
-  *size = static_cast<size_t>(st.st_size);
-
-  *data = static_cast<const unsigned char *>(
-      mmap(nullptr, *size, PROT_READ, MAP_PRIVATE, fd, 0));
-  MACE_CHECK(*data != static_cast<const unsigned char *>(MAP_FAILED),
-             "Failed to map file ", file, ", error code: ", strerror(errno));
-
-  int ret = close(fd);
-  MACE_CHECK(ret == 0,
-             "Failed to close file ", file, ", error code: ", strerror(errno));
-}
-
-void MemoryUnMap(const unsigned char *data,
-                 const size_t &size) {
-  MACE_CHECK(data != nullptr && size > 0, "data is null or size is 0");
-
-  int ret = munmap(const_cast<unsigned char *>(data), size);
-
-  MACE_CHECK(ret == 0,
-             "Failed to unmap file, error code: ", strerror(errno));
-}
-
-}  // namespace mace
diff --git a/repository/git/BUILD b/repository/git/BUILD.bazel
similarity index 100%
rename from repository/git/BUILD
rename to repository/git/BUILD.bazel
diff --git a/repository/git/BUILD.tpl b/repository/git/BUILD.bazel.tpl
similarity index 100%
rename from repository/git/BUILD.tpl
rename to repository/git/BUILD.bazel.tpl
diff --git a/repository/git/git_configure.bzl b/repository/git/git_configure.bzl
index ca2b8b2d5d9d158554bb32933a2b9a825081a3bd..aa1ea598970b60b4f3a0b8d79d6e35cf282565e9 100644
--- a/repository/git/git_configure.bzl
+++ b/repository/git/git_configure.bzl
@@ -2,10 +2,10 @@
 """
 def _git_version_conf_impl(repository_ctx):
     repository_ctx.template(
-        "BUILD",
-        Label("//repository/git:BUILD.tpl"))
+        "BUILD.bazel",
+        Label("//repository/git:BUILD.bazel.tpl"))

-    mace_root_path = str(repository_ctx.path(Label("@mace//:BUILD")))[:-len("BUILD")]
+    mace_root_path = str(repository_ctx.path(Label("@mace//:BUILD.bazel")))[:-len("BUILD.bazel")]

     generated_files_path = repository_ctx.path("gen")
diff --git a/repository/opencl-kernel/BUILD b/repository/opencl-kernel/BUILD.bazel
similarity index 100%
rename from repository/opencl-kernel/BUILD
rename to repository/opencl-kernel/BUILD.bazel
diff --git a/repository/opencl-kernel/BUILD.tpl b/repository/opencl-kernel/BUILD.bazel.tpl
similarity index 100%
rename from repository/opencl-kernel/BUILD.tpl
rename to repository/opencl-kernel/BUILD.bazel.tpl
diff --git a/repository/opencl-kernel/opencl_kernel_configure.bzl b/repository/opencl-kernel/opencl_kernel_configure.bzl
index 88c0880e1bc68a9cd6b4308eb3426e14166d0769..572219b161bf496b68c0949da53c6820554f13c9 100644
--- a/repository/opencl-kernel/opencl_kernel_configure.bzl
+++ b/repository/opencl-kernel/opencl_kernel_configure.bzl
@@ -3,11 +3,11 @@
 def _opencl_encrypt_kernel_impl(repository_ctx):
     repository_ctx.template(
-        "BUILD",
-        Label("//repository/opencl-kernel:BUILD.tpl"),
+        "BUILD.bazel",
+        Label("//repository/opencl-kernel:BUILD.bazel.tpl"),
     )

-    mace_root_path = str(repository_ctx.path(Label("@mace//:BUILD")))[:-len("BUILD")]
+    mace_root_path = str(repository_ctx.path(Label("@mace//:BUILD.bazel")))[:-len("BUILD.bazel")]

     generated_files_path = repository_ctx.path("gen")

     ret = repository_ctx.execute(
diff --git a/third_party/caffe/BUILD b/third_party/caffe/BUILD.bazel
similarity index 100%
rename from third_party/caffe/BUILD
rename to third_party/caffe/BUILD.bazel
diff --git a/third_party/caffe/caffe.proto
b/third_party/caffe/caffe.proto
index b2d56b9898fbcfd0bbd31d7d1356aea12ce87445..c972c9f66bd27c8145b919da2778d40668cf50ff 100644
--- a/third_party/caffe/caffe.proto
+++ b/third_party/caffe/caffe.proto
@@ -515,6 +515,7 @@ message LayerParameter {
   optional InfogainLossParameter infogain_loss_param = 116;
   optional InnerProductParameter inner_product_param = 117;
   optional InputParameter input_param = 143;
+  optional InterpParameter interp_param = 147;
   optional LogParameter log_param = 134;
   optional LRNParameter lrn_param = 118;
   optional MemoryDataParameter memory_data_param = 119;
@@ -1207,6 +1208,15 @@ message InputParameter {
   repeated BlobShape shape = 1;
 }

+message InterpParameter {
+  optional int32 height = 1 [default = 0];  // Height of output
+  optional int32 width = 2 [default = 0];  // Width of output
+  optional int32 zoom_factor = 3 [default = 1];  // zoom factor
+  optional int32 shrink_factor = 4 [default = 1];  // shrink factor
+  optional int32 pad_beg = 5 [default = 0];  // padding at begin of input
+  optional int32 pad_end = 6 [default = 0];  // padding at end of input
+}
+
 // Message that stores parameters used by LogLayer
 message LogParameter {
   // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD.bazel
similarity index 100%
rename from third_party/eigen3/BUILD
rename to third_party/eigen3/BUILD.bazel
diff --git a/third_party/hta/BUILD b/third_party/hta/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7385472755eab0a1fb75df4bb089a63aa01e110e
--- /dev/null
+++ b/third_party/hta/BUILD
@@ -0,0 +1,31 @@
+# These files are generated from the nnlib project
+
+licenses(["notice"])
+
+exports_files(["license.txt"])
+
+load(
+    "//mace:mace.bzl",
+    "if_android_armv7",
+    "if_android_arm64",
+)
+
+cc_library(
+    name = "hta",
+    srcs = if_android_armv7([
+        "armeabi-v7a/libhta_controller.so",
+        "armeabi-v7a/libhta_hexagon_runtime.so",
+        "armeabi-v7a/libnpu.so",
+    ]) + if_android_arm64([
+        "arm64-v8a/libcdsprpc.so",
+        "arm64-v8a/libhta_controller.so",
+        "arm64-v8a/libhta_hexagon_runtime.so",
+        "arm64-v8a/libnpu.so",
+    ]),
+    hdrs = [
+        "hta_hexagon_api.h",
+        "hta_hexagon_nn_ops.h",
+        "hta_ops.h",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/hta/arm64-v8a/libcdsprpc.so b/third_party/hta/arm64-v8a/libcdsprpc.so
new file mode 100755
index 0000000000000000000000000000000000000000..57de01f4887197b0b510f395f828289d74597069
Binary files /dev/null and b/third_party/hta/arm64-v8a/libcdsprpc.so differ
diff --git a/third_party/hta/arm64-v8a/libhta_controller.so b/third_party/hta/arm64-v8a/libhta_controller.so
new file mode 100644
index 0000000000000000000000000000000000000000..3cb5ea31a24d319779521454720c3b587120d2e0
Binary files /dev/null and b/third_party/hta/arm64-v8a/libhta_controller.so differ
diff --git a/third_party/hta/arm64-v8a/libhta_hexagon_runtime.so b/third_party/hta/arm64-v8a/libhta_hexagon_runtime.so
new file mode 100644
index 0000000000000000000000000000000000000000..32b5d784a19a6390ffe25f4c4e4853172b4d5074
Binary files /dev/null and b/third_party/hta/arm64-v8a/libhta_hexagon_runtime.so differ
diff --git a/third_party/hta/arm64-v8a/libnpu.so b/third_party/hta/arm64-v8a/libnpu.so
new file mode 100644
index 0000000000000000000000000000000000000000..9b6633769db106f516ac7cfebea0b40b491996e1
Binary files /dev/null and b/third_party/hta/arm64-v8a/libnpu.so differ
diff --git a/third_party/hta/armeabi-v7a/libhta_controller.so
b/third_party/hta/armeabi-v7a/libhta_controller.so new file mode 100644 index 0000000000000000000000000000000000000000..03b267889d96e74b965fd485313d35ce59b8bc97 Binary files /dev/null and b/third_party/hta/armeabi-v7a/libhta_controller.so differ diff --git a/third_party/hta/armeabi-v7a/libhta_hexagon_runtime.so b/third_party/hta/armeabi-v7a/libhta_hexagon_runtime.so new file mode 100644 index 0000000000000000000000000000000000000000..9136f520d74901ca068c5377eccb578978ca9fa6 Binary files /dev/null and b/third_party/hta/armeabi-v7a/libhta_hexagon_runtime.so differ diff --git a/third_party/hta/armeabi-v7a/libnpu.so b/third_party/hta/armeabi-v7a/libnpu.so new file mode 100644 index 0000000000000000000000000000000000000000..a88605929cfdca12ecd720749064d880a6d48ab4 Binary files /dev/null and b/third_party/hta/armeabi-v7a/libnpu.so differ diff --git a/third_party/hta/hta_hexagon_api.h b/third_party/hta/hta_hexagon_api.h new file mode 100644 index 0000000000000000000000000000000000000000..cb13fe62bcd8bbdcb8f50f4dfb725df292aa87fd --- /dev/null +++ b/third_party/hta/hta_hexagon_api.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT + * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef THIRD_PARTY_HTA_HEXAGON_API_H_ +#define THIRD_PARTY_HTA_HEXAGON_API_H_ + +#include "hta_hexagon_nn_ops.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int hexagon_hta_nn_nn_id; + +struct input { + uint32_t src_id; + uint32_t output_idx; +}; + +#define NODE_ID_RESERVED_CONSTANT 0 + +#define MAX_DIMENSIONS 8 +struct output { + uint32_t rank; // dimensions in the tensor + uint32_t max_sizes[MAX_DIMENSIONS]; // max num elements in each dimension + uint32_t elementsize; // size of each element + int32_t zero_offset; // 0 for float / integer values + float stepsize; // 0 for float/integer values +}; + +struct perfinfo { + uint32_t node_id; + uint32_t executions; + union { + uint64_t counter; + struct { + uint32_t counter_lo; + uint32_t counter_hi; + }; + }; +}; + +typedef struct input hexagon_hta_nn_input; +typedef struct output hexagon_hta_nn_output; +typedef struct perfinfo hexagon_hta_nn_perfinfo; +typedef int32_t hexagon_hta_nn_padding_type; + +typedef enum padding_type_enum { + HTA_NN_PAD_NA = 0, + HTA_NN_PAD_SAME, + HTA_NN_PAD_VALID, + HTA_NN_PAD_MIRROR_REFLECT, + HTA_NN_PAD_MIRROR_SYMMETRIC, + HTA_NN_PAD_SAME_CAFFE, +} hta_padding_type; + +typedef struct { + unsigned int batches; + unsigned int height; + unsigned int width; + unsigned int depth; + unsigned char *data; + int dataLen; /* For input and output */ + unsigned int data_valid_len; /* for output only */ + unsigned int unused; +} hexagon_hta_nn_tensordef; + +typedef struct hexagon_nn_op_node hexagon_nn_op_node; +struct hexagon_nn_op_node { + unsigned int node_id; + hta_op_type operation; + hta_padding_type padding; + hexagon_hta_nn_input* inputs; + int inputsLen; + hexagon_hta_nn_output* outputs; + int outputsLen; +}; +typedef struct hexagon_nn_const_node hexagon_nn_const_node; +struct hexagon_nn_const_node { + unsigned int node_id; + hexagon_hta_nn_tensordef tensor; +}; + +/* Actual functions in the interface */ +/* Returns 0 on success, nonzero on error unless otherwise noted */ +/* Configure the hardware and software environment. Should be called once before doing anything */ +int hexagon_hta_nn_config( void ); + +/* Initialize a new graph, returns a new nn_id or -1 on error */ +int hexagon_hta_nn_init(hexagon_hta_nn_nn_id *g); + +/* Set debug verbosity. Default is 0, higher values are more verbose */ +int hexagon_hta_nn_set_debug_level(hexagon_hta_nn_nn_id id, int level); + +/* Append a node to the graph. Nodes are executed in the appended order. */ +int hexagon_hta_nn_append_node( + hexagon_hta_nn_nn_id id, + uint32_t node_id, + hta_op_type operation, + hta_padding_type padding, + const struct input *inputs, + uint32_t num_inputs, + const struct output *outputs, + uint32_t num_outputs); + +/* + * Append a const node into the graph. The data is copied locally during this + * call, the caller does not need it to persist. + */ +int hexagon_hta_nn_append_const_node( + hexagon_hta_nn_nn_id id, + uint32_t node_id, + uint32_t batches, + uint32_t height, + uint32_t width, + uint32_t depth, + const uint8_t *data, + uint32_t data_len); + +/* + * Prepare a graph for execution. Must be done before attempting to execute the graph. + */ +int hexagon_hta_nn_prepare(hexagon_hta_nn_nn_id id); + +/* Execute the graph with a single input and a single output. 
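+ * (Editor's illustration, not part of the vendor header: per the comments
+ * in this file, the expected call order is
+ *   hexagon_hta_nn_config();                 // once, before anything else
+ *   hexagon_hta_nn_nn_id id;
+ *   hexagon_hta_nn_init(&id);
+ *   // ... hexagon_hta_nn_append_const_node / hexagon_hta_nn_append_node ...
+ *   hexagon_hta_nn_prepare(id);
+ *   // ... hexagon_hta_nn_execute or hexagon_hta_nn_execute_new ...
+ *   hexagon_hta_nn_teardown(id);
+ * with each call returning 0 on success and nonzero on error.)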
*/ +int hexagon_hta_nn_execute( + hexagon_hta_nn_nn_id id, + uint32_t batches_in, + uint32_t height_in, + uint32_t width_in, + uint32_t depth_in, + const uint8_t *data_in, + uint32_t data_len_in, + uint32_t *batches_out, + uint32_t *height_out, + uint32_t *width_out, + uint32_t *depth_out, + uint8_t *data_out, + uint32_t data_out_max, + uint32_t *data_out_size); + +/* Tear down a graph, destroying it and freeing resources. */ +int hexagon_hta_nn_teardown(hexagon_hta_nn_nn_id id); + +/* Get the version of the library */ +int hexagon_hta_nn_version(int *ver); + +/* Execute the graph with a multiple input and a multiple output. */ +int hexagon_hta_nn_execute_new( + hexagon_hta_nn_nn_id id, + const hexagon_hta_nn_tensordef *inputs, + uint32_t n_inputs, + hexagon_hta_nn_tensordef *outputs, + uint32_t n_outputs); + +int hexagon_hta_nn_serialize_size(hexagon_hta_nn_nn_id id, unsigned int *serialized_obj_size_out); +int hexagon_hta_nn_serialize(hexagon_hta_nn_nn_id id, void *buf, unsigned int buf_len); +int hexagon_hta_nn_deserialize(void *buf, unsigned len, hexagon_hta_nn_nn_id *g); + +#ifdef __cplusplus +} +#endif + +#endif //THIRD_PARTY_HTA_HEXAGON_API_H_ diff --git a/third_party/hta/hta_hexagon_nn_ops.h b/third_party/hta/hta_hexagon_nn_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..e2aaa5881c842d12892d21dead102efad08df270 --- /dev/null +++ b/third_party/hta/hta_hexagon_nn_ops.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT + * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef THIRD_PARTY_HTA_HEXAGON_NN_OPS_H_ +#define THIRD_PARTY_HTA_HEXAGON_NN_OPS_H_ + +typedef enum hta_op_type_enum { +#define HTA_DEF_OP(NAME, ...) 
HTA_OP_##NAME, + +#include "hta_ops.h" + HTA_NN_OPS_MAX + +#undef HTA_DEF_OP +} hta_op_type; + +#endif // THIRD_PARTY_HTA_HEXAGON_NN_OPS_H_ diff --git a/third_party/hta/hta_ops.h b/third_party/hta/hta_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3becf1d3a79534131a8cfb3c9508bada52752623 --- /dev/null +++ b/third_party/hta/hta_ops.h @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT + * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * You probably want to + * + * ## ##### ##### + * # # # # # # + * # # # # # # + * ###### # # # # + * # # # # # # + * # # ##### ##### + * + * + * # # #### ##### ###### #### + * ## # # # # # # # + * # # # # # # # ##### #### + * # # # # # # # # # + * # ## # # # # # # # + * # # #### ##### ###### #### + * + * + * ## ##### + * # # # + * # # # + * ###### # + * # # # + * # # # + * + * + * ##### # # ###### + * # # # # + * # ###### ##### + * # # # # + * # # # # + * # # # ###### + * + * + * ###### # # ##### + * # ## # # # + * ##### # # # # # + * # # # # # # + * # # ## # # + * ###### # # ##### + * + * otherwise the interface becomes incompatible. 
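+ *
+ * (Editor's note, illustrative only: this file is an X-macro list. Each
+ * consumer defines HTA_DEF_OP before including it; hta_hexagon_nn_ops.h
+ * above expands every HTA_DEF_OP(NAME) into an HTA_OP_##NAME enumerator.
+ * A hypothetical name table could be generated the same way:
+ *
+ *   #define HTA_DEF_OP(NAME, ...) #NAME,
+ *   static const char *hta_op_names[] = {
+ *   #include "hta_ops.h"
+ *   };
+ *   #undef HTA_DEF_OP
+ *
+ * Because consumers rely on positional expansion, appending anywhere but
+ * the end renumbers existing enumerators, which is why the warning above
+ * exists.)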
+ */ +HTA_DEF_OP(INPUT) +HTA_DEF_OP(OUTPUT) +HTA_DEF_OP(Nop) +HTA_DEF_OP(Const) +HTA_DEF_OP(Check) +HTA_DEF_OP(Close_f) +HTA_DEF_OP(Close_quint8) +HTA_DEF_OP(Close_q_quint8) +HTA_DEF_OP(Close_int32) +HTA_DEF_OP(Close_qint32) +HTA_DEF_OP(PPrint_8) +HTA_DEF_OP(PPrint_32) +HTA_DEF_OP(PPrint_f) +HTA_DEF_OP(PreFree) +HTA_DEF_OP(Flatten) + +#ifndef HTA_DEF_OP_WREF +#define HTA_DEF_OP_WREF(NAME) HTA_DEF_OP(NAME) HTA_DEF_OP(NAME##_ref) +#define __SELF_HTA_DEF_OP_WREF +#endif + +HTA_DEF_OP_WREF(QuantizedConv2d_8x8to32) +HTA_DEF_OP_WREF(QuantizedMatMul_8x8to32) +HTA_DEF_OP_WREF(QuantizeDownAndShrinkRange_32to8) +HTA_DEF_OP_WREF(QuantizedRelu_8) +HTA_DEF_OP_WREF(QuantizedReluX_8) +HTA_DEF_OP_WREF(QuantizedMaxPool_8) +HTA_DEF_OP_WREF(QuantizedAvgPool_8) +HTA_DEF_OP_WREF(QuantizedL2Pool_8) +HTA_DEF_OP_WREF(QuantizedConcat_8) +HTA_DEF_OP_WREF(QuantizedBiasAdd_8p8to32) +HTA_DEF_OP_WREF(Min_f) +HTA_DEF_OP_WREF(Max_f) +HTA_DEF_OP_WREF(Quantize) +HTA_DEF_OP_WREF(Dequantize) +HTA_DEF_OP_WREF(Supernode_8x8p8to8) + +HTA_DEF_OP(QuantizedFlatten) +HTA_DEF_OP(Softmax_f) +HTA_DEF_OP(Conv2d_f) +HTA_DEF_OP(MatMul_f) +HTA_DEF_OP(Relu_f) +HTA_DEF_OP(ReluX_f) +HTA_DEF_OP(AvgPool_f) +HTA_DEF_OP(L2Pool_f) +HTA_DEF_OP(MaxPool_f) +HTA_DEF_OP(Concat_f) +HTA_DEF_OP(BiasAdd_f) +HTA_DEF_OP(LRN_f) + +HTA_DEF_OP(Variable) +HTA_DEF_OP(Assign) +HTA_DEF_OP(Reshape) +HTA_DEF_OP(QuantizedReshape) +HTA_DEF_OP(Tanh_f) +HTA_DEF_OP(Sigmoid_f) +HTA_DEF_OP(Slice_8) +HTA_DEF_OP(Slice_f) +HTA_DEF_OP(QuantizedSlice_8) +HTA_DEF_OP(Add_f) +HTA_DEF_OP(Mul_f) +HTA_DEF_OP(Minimum_f) +HTA_DEF_OP(Maximum_f) + +HTA_DEF_OP_WREF(Requantize_32to8) +HTA_DEF_OP_WREF(RequantizationRange_32) + +HTA_DEF_OP(Neg_f) +HTA_DEF_OP(Sub_f) +HTA_DEF_OP(AddN_f) +HTA_DEF_OP(Range_int32) +HTA_DEF_OP(Rank_int32) +HTA_DEF_OP(Transpose_int32) +HTA_DEF_OP(Transpose_f) +HTA_DEF_OP(InstanceNorm_f) +HTA_DEF_OP_WREF(QuantizedInstanceNorm_8) +HTA_DEF_OP(Sub_int32) +HTA_DEF_OP(Add_int32) +HTA_DEF_OP(Split_f) +HTA_DEF_OP(Dequantize_qint32_f) +HTA_DEF_OP(PRelu_f) +HTA_DEF_OP_WREF(QuantizedPRelu_8) +HTA_DEF_OP(Sum_f) +HTA_DEF_OP(Prod_f) +HTA_DEF_OP(Mul_int32) +HTA_DEF_OP(LogicalAnd_int32) +HTA_DEF_OP(LogicalOr_int32) +HTA_DEF_OP(LogicalXor_int32) +HTA_DEF_OP(Shape_int32) +HTA_DEF_OP(Pack_int32) +HTA_DEF_OP(MirrorPad_f) +HTA_DEF_OP(ResizeNearestNeighbor_f) +HTA_DEF_OP(StridedSlice_int32) +HTA_DEF_OP(StridedSlice_f) +HTA_DEF_OP(ExpandDims_int32) +HTA_DEF_OP(ExpandDims_f) + +HTA_DEF_OP(LogSoftmax_f) +HTA_DEF_OP(Split_int32) +HTA_DEF_OP(QuantizedSplit_8) + +HTA_DEF_OP(Deconv_f) +HTA_DEF_OP_WREF(QuantizedDeconv_8x8to32) + +HTA_DEF_OP_WREF(QuantizedMul_8x8to32) +HTA_DEF_OP_WREF(QuantizedAdd_8p8to32) +HTA_DEF_OP_WREF(QuantizedSigmoid_8) +HTA_DEF_OP_WREF(QuantizedTanh_8) +HTA_DEF_OP_WREF(QuantizedSoftmax_8) +HTA_DEF_OP_WREF(QuantizedLRN_8) +HTA_DEF_OP_WREF(Quantizedpad2d_frame_8p) +HTA_DEF_OP_WREF(QuantizedSub_8p8to32) +HTA_DEF_OP_WREF(QuantizedMaximum_8) +HTA_DEF_OP_WREF(QuantizedMinimum_8) + +HTA_DEF_OP(Pad_f) +HTA_DEF_OP(SpaceToBatchND_f) +HTA_DEF_OP(BatchToSpaceND_f) +HTA_DEF_OP(QuantizedPad_8) +HTA_DEF_OP(ResizeBilinear_f) +HTA_DEF_OP(ConcatV2_f) +HTA_DEF_OP(ConcatV2_int32) +HTA_DEF_OP(Prod_int32) +HTA_DEF_OP(Slice_int32) + +HTA_DEF_OP(QuantizedAdd_8p8to8) +HTA_DEF_OP(QuantizedResizeBilinear_8) +HTA_DEF_OP(Supernode_8x8p8to8_d32) +HTA_DEF_OP(Convert_to_d32) +HTA_DEF_OP(Convert_from_d32) +HTA_DEF_OP_WREF(QuantizedMaxPool_8_d32) +HTA_DEF_OP_WREF(QuantizedConcat_8_d32) +HTA_DEF_OP_WREF(QuantizedAvgPool_8_d32) + +HTA_DEF_OP(Sink) + +HTA_DEF_OP_WREF(QuantizedPRelu_8_d32) 
+HTA_DEF_OP_WREF(AutoQuantize) +HTA_DEF_OP_WREF(QuantizedDepthwiseConv2d_8x8to32) +HTA_DEF_OP_WREF(DepthwiseConv2d_f) +HTA_DEF_OP(DepthwiseSupernode_8x8p8to8) +HTA_DEF_OP(DepthwiseSupernode_8x8p8to8_d32) + +HTA_DEF_OP_WREF(QuantizedMul_8x8to8_d32) + +HTA_DEF_OP(FullyConnected_u8) +#if 0 +HTA_DEF_OP_WREF(QuantizedFC_8x8p8to8) +#endif + +HTA_DEF_OP_WREF(QuantizedAdd_8p8to8_d32) + +HTA_DEF_OP_WREF(QuantizedClamp_8) +HTA_DEF_OP(Clamp_f) +HTA_DEF_OP(QuantizeForTest_d32) +HTA_DEF_OP(Close_d32) +HTA_DEF_OP_WREF(QuantizedSub_8p8to8_d32) + +HTA_DEF_OP(InputSupernode_8x8p8to8_outd32) +HTA_DEF_OP(QuantizedLRN_8_d32) +HTA_DEF_OP_WREF(QuantizedBiasAdd_32p32to32) +HTA_DEF_OP_WREF(Quantize_int32) + +HTA_DEF_OP(Supernode_8x8p32to8) +HTA_DEF_OP(DepthwiseSupernode_8x8p32to8) +HTA_DEF_OP(Supernode_8x8p32to8_d32) +HTA_DEF_OP(DepthwiseSupernode_8x8p32to8_d32) +HTA_DEF_OP(InputSupernode_8x8p32to8_outd32) + +HTA_DEF_OP(PPrint_8_d32) +HTA_DEF_OP(PPrintWithPadding_8_d32) +HTA_DEF_OP_WREF(AutoQuantize_d32) + +HTA_DEF_OP_WREF(QuantizedTanh_8_d32) +HTA_DEF_OP_WREF(QuantizedSigmoid_8_d32) +HTA_DEF_OP_WREF(QuantizedSoftmax_8_d32) + + +HTA_DEF_OP_WREF(QuantizedL2Pool_8_d32) + +HTA_DEF_OP(Gather_f) +HTA_DEF_OP(Gather_int32) +HTA_DEF_OP(Gather_8) +HTA_DEF_OP(Table_f) +HTA_DEF_OP(Table_int32) +HTA_DEF_OP(Table_8) + +HTA_DEF_OP(FillPadding_8_d32) +HTA_DEF_OP(QuantizedResizeBilinear_8_d32) + +HTA_DEF_OP(QuantizeINPUT_f_to_8) +HTA_DEF_OP_WREF(DeconvBias_8x8to32) + +HTA_DEF_OP(SpaceToBatchND_8) +HTA_DEF_OP(BatchToSpaceND_8) + + +HTA_DEF_OP(SpaceToDepth_f) +HTA_DEF_OP(DepthToSpace_f) +HTA_DEF_OP(SpaceToDepth_8) +HTA_DEF_OP(DepthToSpace_8) + +HTA_DEF_OP(DequantizeOUTPUT_8tof) +HTA_DEF_OP(QuantizedBatchNorm_8x8p8to8) +HTA_DEF_OP(QuantizedBatchNorm_8x8p32to8) +HTA_DEF_OP(QuantizedBatchNorm_8x8p8to8_d32) +HTA_DEF_OP(QuantizedBatchNorm_8x8p32to8_d32) + +HTA_DEF_OP_WREF(QuantizedInstanceNorm_8_d32) +HTA_DEF_OP_WREF(QuantizedInstanceNormBG_8) +HTA_DEF_OP_WREF(QuantizedInstanceNormBG_8_d32) + +HTA_DEF_OP(SuperFC_8x8p32to8) +HTA_DEF_OP(SuperFC_8x8p32to8_ref) +HTA_DEF_OP(SuperFC_8x8p32to8_d32) + +HTA_DEF_OP(ChannelShuffle_f) +HTA_DEF_OP(ChannelShuffle_int32) +HTA_DEF_OP_WREF(QuantizedChannelShuffle_8) +HTA_DEF_OP(QuantizedChannelShuffle_8_d32) +/* this is in op_chanshuffle_d32.c*/ +HTA_DEF_OP(QuantizedSplit_8_d32) + +HTA_DEF_OP(QuantizedCrop_8) +HTA_DEF_OP(ResizeUnitSquare_f) +HTA_DEF_OP_WREF(ResizeUnitSquare_8) +HTA_DEF_OP_WREF(Nv21ToRgb_8) +HTA_DEF_OP_WREF(RgbaToRgb_8) +HTA_DEF_OP_WREF(Argb32ToRgb_8) +HTA_DEF_OP(Permute_f) +HTA_DEF_OP(QuantizedPermute_8) +HTA_DEF_OP_WREF(QuantizedRoiPool_8) +HTA_DEF_OP(Proposal_f) +HTA_DEF_OP(RoiAlign_f) +HTA_DEF_OP_WREF(QuantizedRoiAlign_8) +HTA_DEF_OP_WREF(Implode_8) +HTA_DEF_OP(QuantizedConcat_8_nond32) + +HTA_DEF_OP(Close_16tof) +HTA_DEF_OP(QuantizedLstmInput_16x16to16) +HTA_DEF_OP(QuantizedLstmOutput_16x16to8) + +HTA_DEF_OP(Quantize_16) +HTA_DEF_OP(Dequantize_16) +HTA_DEF_OP(Convert_8_16) +HTA_DEF_OP(QuantizedTanh_16) +HTA_DEF_OP(QuantizedSigmoid_16) + +HTA_DEF_OP_WREF(QuantizeDownAndShrinkRange_32to16) +HTA_DEF_OP_WREF(Requantize_32to16) +HTA_DEF_OP_WREF(QuantizedMatMul_8x8p32to16) + +HTA_DEF_OP(QuantizedStridedSlice_8) +HTA_DEF_OP(Bbox_Transform_f) +HTA_DEF_OP(Softmax_uint8) + +HTA_DEF_OP(QuantizedFakeConcat_8_d32) + +HTA_DEF_OP(DepthToSpace_8_d32) +HTA_DEF_OP(OemNode) + +HTA_DEF_OP(QuantizedPad_8_d32) +// Add new operations above this line +#ifdef __SELF_HTA_DEF_OP_WREF +#undef __SELF_HTA_DEF_OP_WREF +#undef HTA_DEF_OP_WREF +#endif + diff --git a/third_party/hta/libhta_dsp_skel.so 
diff --git a/third_party/hta/libhta_dsp_skel.so b/third_party/hta/libhta_dsp_skel.so
new file mode 100644
index 0000000000000000000000000000000000000000..6a371cfef8f47e6541be0f6bc307d9ed72aa5c7a
Binary files /dev/null and b/third_party/hta/libhta_dsp_skel.so differ
diff --git a/third_party/hta/license.txt b/third_party/hta/license.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1fc186df55d1d4b6d43eaea9f7e77be6bc470459
--- /dev/null
+++ b/third_party/hta/license.txt
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted (subject to the limitations in the
+ * disclaimer below) provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * * Neither the name of The Linux Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+ * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
diff --git a/third_party/nnlib/BUILD b/third_party/nnlib/BUILD.bazel
similarity index 100%
rename from third_party/nnlib/BUILD
rename to third_party/nnlib/BUILD.bazel
diff --git a/tools/aarch64_compiler/BUILD b/tools/aarch64_compiler/BUILD.bazel
similarity index 100%
rename from tools/aarch64_compiler/BUILD
rename to tools/aarch64_compiler/BUILD.bazel
diff --git a/tools/aarch64_compiler/linaro_linux_gcc/BUILD b/tools/aarch64_compiler/linaro_linux_gcc/BUILD.bazel
similarity index 100%
rename from tools/aarch64_compiler/linaro_linux_gcc/BUILD
rename to tools/aarch64_compiler/linaro_linux_gcc/BUILD.bazel
diff --git a/tools/arm_compiler/BUILD b/tools/arm_compiler/BUILD.bazel
similarity index 93%
rename from tools/arm_compiler/BUILD
rename to tools/arm_compiler/BUILD.bazel
index 30f83eb0a4f55815009c4d262cff634ec1516e48..140ed0d38c6a0f0a3be22daf73b959f78bb3e755 100644
--- a/tools/arm_compiler/BUILD
+++ b/tools/arm_compiler/BUILD.bazel
@@ -11,7 +11,7 @@
 filegroup(
     name = "toolchain_fg",
     srcs = [
-        ":cc-compiler-armeabi-v7a",
+        ":cc-compiler-armhf",
         ":linaro_linux_all_files",
         "@gcc_linaro_7_3_1_arm_linux_gnueabihf//:compiler_components",
     ],
@@ -29,7 +29,7 @@ cc_toolchain_suite(
     name = "toolchain",
     # target_cpu | compiler
     toolchains = {
-        "armeabi-v7a|gcc": "cc-compiler-armeabi-v7a",
+        "armhf|gcc": "cc-compiler-armhf",
     },
 )
@@ -66,10 +66,10 @@ filegroup(
 )
 
 cc_toolchain(
-    name = "cc-compiler-armeabi-v7a",
+    name = "cc-compiler-armhf",
     all_files = ":linaro_linux_all_files",
     compiler_files = ":linaro_linux_compiler_files",
-    cpu = "armeabi-v7a",
+    cpu = "armhf",
     dwp_files = ":empty",
     dynamic_runtime_libs = [":empty"],
     linker_files = ":linaro_linux_linker_files",
diff --git a/tools/arm_compiler/CROSSTOOL b/tools/arm_compiler/CROSSTOOL
index ce7f6d15ccc177a5fcbc0e94a56438d5ef5278cb..58edd2f976cc37f2d27fc2763647d6cab881c080 100644
--- a/tools/arm_compiler/CROSSTOOL
+++ b/tools/arm_compiler/CROSSTOOL
@@ -1,9 +1,9 @@
 major_version: "local"
 minor_version: ""
-default_target_cpu: "armeabi-v7a"
+default_target_cpu: "armhf"
 
 default_toolchain {
-  cpu: "armeabi-v7a"
+  cpu: "armhf"
   toolchain_identifier: "arm-linux-gnueabihf"
 }
 
@@ -12,7 +12,7 @@ toolchain {
   abi_libc_version: ""
   builtin_sysroot: ""
   compiler: "gcc"
-  host_system_name: "armeabi-v7a"
+  host_system_name: "armhf"
   needsPic: true
   supports_gold_linker: true
   supports_incremental_linker: false
@@ -22,7 +22,7 @@ toolchain {
   supports_start_end_lib: false
   supports_thin_archives: true
   target_libc: ""
-  target_cpu: "armeabi-v7a"
+  target_cpu: "armhf"
   target_system_name: ""
   toolchain_identifier: "arm-linux-gnueabihf"
diff --git a/tools/arm_compiler/linaro_linux_gcc/BUILD b/tools/arm_compiler/linaro_linux_gcc/BUILD.bazel
similarity index 100%
rename from tools/arm_compiler/linaro_linux_gcc/BUILD
rename to tools/arm_compiler/linaro_linux_gcc/BUILD.bazel
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 77978ea3930d3df56ab216f50c3687a05037e16d..067cb7e1772cca81dc8bd9fb9713c7ef1aa151c4 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -14,20 +14,37 @@ build --copt=-DGEMMLOWP_USE_OPENMP
 build:symbol_hidden --copt=-fvisibility=hidden
 
 # Usage example: bazel build --config android
+build:android --linkopt=-pie
+build:android --linkopt=-ldl
+build:android --linkopt=-llog
+build:android --linkopt=-lm
 build:android --distinct_host_configuration=true
 build:android --crosstool_top=//external:android/crosstool
 build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
 
+# Linux host build, --config linux
+build:linux --define linux=true
+
+# MacOS host build, --config darwin
+build:darwin --define darwin=true
+
+# iOS and other darwin platforms, --config ios
+build:ios --define darwin=true
+build:ios --distinct_host_configuration=true
+build:ios --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+build:ios --cpu=ios_arm64
+
 # Usage example: bazel build --config arm_linux_gnueabihf
 # Used to fix library not find linking issue, see also:
 # https://github.com/bazelbuild/bazel/issues/6653,
 # https://github.com/bazelbuild/bazel/issues/6189
+build:arm_linux_gnueabihf --define linux=true
 build:arm_linux_gnueabihf --spawn_strategy=standalone
 build:arm_linux_gnueabihf --distinct_host_configuration=true
 build:arm_linux_gnueabihf --crosstool_top=//tools/arm_compiler:toolchain
 build:arm_linux_gnueabihf --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
-build:arm_linux_gnueabihf --cpu=armeabi-v7a
+build:arm_linux_gnueabihf --cpu=armhf
 build:arm_linux_gnueabihf --copt -mfloat-abi=hard
 build:arm_linux_gnueabihf --copt -mfpu=neon
 build:arm_linux_gnueabihf --copt -Wno-ignored-attributes
@@ -36,6 +65,7 @@ build:arm_linux_gnueabihf --copt -Wno-sequence-point
 build:arm_linux_gnueabihf --copt -Wno-implicit-fallthrough
 
 # Usage example: bazel build --config aarch64_linux_gnu
+build:aarch64_linux_gnu --define linux=true
 build:aarch64_linux_gnu --spawn_strategy=standalone
 build:aarch64_linux_gnu --distinct_host_configuration=true
 build:aarch64_linux_gnu --crosstool_top=//tools/aarch64_compiler:toolchain
diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py
index 37d02b78f993dc4fcb38e9359ba404b4ed89eed5..328620c1f179869e36b4340199a6aefbe85f4466 100644
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -86,7 +86,7 @@ def parse_args():
         type=str,
         default="all",
         help="SoCs (ro.board.platform from getprop) to build, "
-        "comma seperated list or all/random")
+             "comma separated list or all/random")
     parser.add_argument(
         "--target", type=str, default="//...", help="Bazel target to build")
     parser.add_argument(
@@ -118,8 +118,8 @@ def parse_args():
         '--device_yml',
         type=str,
         default='',
-        help='embedded linux device config yml file'
-    )
+        help='embedded linux device config yml file')
+    parser.add_argument('--vlog_level', type=int, default=0, help='vlog level')
     return parser.parse_known_args()
 
@@ -130,10 +130,12 @@ def main(unused_args):
     for target_abi in target_abis:
         toolchain = infer_toolchain(target_abi)
-        sh_commands.bazel_build(target, abi=target_abi,
-                                toolchain=toolchain,
-                                enable_neon=FLAGS.enable_neon,
-                                address_sanitizer=FLAGS.address_sanitizer)
+        sh_commands.bazel_build(
+            target,
+            abi=target_abi,
+            toolchain=toolchain,
+            enable_neon=FLAGS.enable_neon,
+            address_sanitizer=FLAGS.address_sanitizer)
         if FLAGS.run_target:
             target_devices = DeviceManager.list_devices(FLAGS.device_yml)
             if FLAGS.target_socs != TargetSOCTag.all and\
@@ -158,12 +160,11 @@ def main(unused_args):
                         bin_name,
                         args=FLAGS.args,
                         opencl_profiling=True,
-                        vlog_level=0,
+                        vlog_level=FLAGS.vlog_level,
                         out_of_range_check=True,
                         address_sanitizer=FLAGS.address_sanitizer,
                         simpleperf=FLAGS.simpleperf)
-                    globals()[FLAGS.stdout_processor](stdouts, dev,
-                                                      target_abi)
+                    globals()[FLAGS.stdout_processor](stdouts, dev, target_abi)
 
 if __name__ == "__main__":
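The new --vlog_level flag replaces the hard-coded vlog_level=0, so unit-test runs can raise the runtime's VLOG verbosity from the command line; as the tools/device.py hunks further down show, the value ends up exported as MACE_CPP_MIN_VLOG_LEVEL in front of the launched binary. A minimal sketch of that plumbing (the launch helper below is illustrative, not part of this patch):

    import subprocess

    def launch(binary, vlog_level=0):
        # Illustrative: prefix the command with "env MACE_CPP_MIN_VLOG_LEVEL=<n>"
        # the same way tools/device.py does before running a test binary.
        return subprocess.call(
            ["env", "MACE_CPP_MIN_VLOG_LEVEL=%d" % vlog_level, binary])

Typical invocation then looks like: python tools/bazel_adb_run.py --target="//mace/utils:utils_test" --vlog_level=2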
diff --git a/tools/common.py b/tools/common.py
index 8e69ed8ed20cb8d20b83d2492afe3f377a65c8c3..82a25e5d5e6c04c1db474f93cf7dd21c3d1d48d3 100644
--- a/tools/common.py
+++ b/tools/common.py
@@ -129,6 +129,14 @@ class DeviceType(object):
     CPU = 'CPU'
     GPU = 'GPU'
     HEXAGON = 'HEXAGON'
+    HTA = 'HTA'
+
+
+class DataFormat(object):
+    NONE = "NONE"
+    NHWC = "NHWC"
+    NCHW = "NCHW"
+    OIHW = "OIHW"
 
 
 ################################
@@ -193,6 +201,8 @@ def parse_device_type(runtime):
     if runtime == RuntimeType.dsp:
         device_type = DeviceType.HEXAGON
+    elif runtime == RuntimeType.hta:
+        device_type = DeviceType.HTA
     elif runtime == RuntimeType.gpu:
         device_type = DeviceType.GPU
     elif runtime == RuntimeType.cpu:
@@ -401,6 +411,7 @@ class YAMLKeyword(object):
     graph_optimize_options = 'graph_optimize_options'  # internal use for now
     cl_mem_type = 'cl_mem_type'
     backend = 'backend'
+    validation_outputs_data = 'validation_outputs_data'
     docker_image_tag = 'docker_image_tag'
     dockerfile_path = 'dockerfile_path'
     dockerfile_sha256_checksum = 'dockerfile_sha256_checksum'
@@ -506,6 +517,7 @@ class RuntimeType(object):
     cpu = 'cpu'
     gpu = 'gpu'
     dsp = 'dsp'
+    hta = 'hta'
     cpu_gpu = 'cpu+gpu'
 
@@ -524,3 +536,10 @@ class ToolchainType:
 class TargetSOCTag:
     all = 'all'
     random = 'random'
+
+
+def split_shape(shape):
+    if shape.strip() == "":
+        return []
+    else:
+        return shape.split(',')
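The split_shape helper added above exists so that an empty shape string (a rank-0, scalar tensor) maps to an empty dimension list rather than [''], which would crash the int(x) conversions in its callers (see tools/generate_data.py and tools/validate.py below). A quick sketch of the difference, assuming tools/common.py is importable as common:

    import common

    common.split_shape("1,224,224,3")  # -> ['1', '224', '224', '3']
    common.split_shape("")             # -> [] , a valid scalar shape
    "".split(',')                      # -> [''] , and int('') raises ValueError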
diff --git a/tools/converter.py b/tools/converter.py
index b3a6569638137b52d25d3ac40b246eb8aba3bf8c..99a24b877e44c650cac5dbd7aea4a5a213df492f 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -14,12 +14,9 @@
 import argparse
 import glob
-import hashlib
-import os
-import re
 import sh
 import sys
-import urllib
+import time
 import yaml
 
 from enum import Enum
@@ -64,6 +61,7 @@ RuntimeTypeStrs = [
     "cpu",
     "gpu",
     "dsp",
+    "hta",
     "cpu+gpu"
 ]
@@ -96,14 +94,11 @@ WinogradParameters = [0, 2, 4]
 DataFormatStrs = [
     "NONE",
     "NHWC",
+    "NCHW",
+    "OIHW",
 ]
 
-class DataFormat(object):
-    NONE = "NONE"
-    NHWC = "NHWC"
-
-
 class DefaultValues(object):
     mace_lib_type = MACELibType.static
     omp_num_threads = -1,
@@ -149,6 +144,8 @@ def parse_device_type(runtime):
     if runtime == RuntimeType.dsp:
         device_type = DeviceType.HEXAGON
+    elif runtime == RuntimeType.hta:
+        device_type = DeviceType.HTA
     elif runtime == RuntimeType.gpu:
         device_type = DeviceType.GPU
     elif runtime == RuntimeType.cpu:
@@ -170,6 +167,19 @@ def get_hexagon_mode(configs):
     return False
 
+def get_hta_mode(configs):
+    runtime_list = []
+    for model_name in configs[YAMLKeyword.models]:
+        model_runtime = \
+            configs[YAMLKeyword.models][model_name].get(
+                YAMLKeyword.runtime, "")
+        runtime_list.append(model_runtime.lower())
+
+    if RuntimeType.hta in runtime_list:
+        return True
+    return False
+
+
 def get_opencl_mode(configs):
     runtime_list = []
     for model_name in configs[YAMLKeyword.models]:
@@ -371,6 +381,15 @@ def format_model_config(flags):
                 if not isinstance(value, list):
                     subgraph[key] = [value]
                 subgraph[key] = [str(v) for v in subgraph[key]]
+            input_size = len(subgraph[YAMLKeyword.input_tensors])
+            output_size = len(subgraph[YAMLKeyword.output_tensors])
+
+            mace_check(len(subgraph[YAMLKeyword.input_shapes]) == input_size,
+                       ModuleName.YAML_CONFIG,
+                       "input shapes' size not equal inputs' size.")
+            mace_check(len(subgraph[YAMLKeyword.output_shapes]) == output_size,
+                       ModuleName.YAML_CONFIG,
+                       "output shapes' size not equal outputs' size.")
 
             for key in [YAMLKeyword.check_tensors,
                         YAMLKeyword.check_shapes]:
@@ -399,13 +418,13 @@ def format_model_config(flags):
             if input_data_formats:
                 if not isinstance(input_data_formats, list):
                     subgraph[YAMLKeyword.input_data_formats] =\
-                        [input_data_formats]
+                        [input_data_formats] * input_size
                 else:
                     mace_check(len(input_data_formats)
-                               == len(subgraph[YAMLKeyword.input_tensors]),
+                               == input_size,
                                ModuleName.YAML_CONFIG,
                                "input_data_formats should match"
-                               " the size of input")
+                               " the size of input.")
                 for input_data_format in\
                         subgraph[YAMLKeyword.input_data_formats]:
                     mace_check(input_data_format in DataFormatStrs,
@@ -414,17 +433,18 @@ def format_model_config(flags):
                                + str(DataFormatStrs) + ", but got "
                                + input_data_format)
             else:
-                subgraph[YAMLKeyword.input_data_formats] = [DataFormat.NHWC]
+                subgraph[YAMLKeyword.input_data_formats] = \
+                    [DataFormat.NHWC] * input_size
 
             output_data_formats = subgraph.get(YAMLKeyword.output_data_formats,
                                                [])
             if output_data_formats:
                 if not isinstance(output_data_formats, list):
                     subgraph[YAMLKeyword.output_data_formats] = \
-                        [output_data_formats]
+                        [output_data_formats] * output_size
                 else:
                     mace_check(len(output_data_formats)
-                               == len(subgraph[YAMLKeyword.output_tensors]),
+                               == output_size,
                                ModuleName.YAML_CONFIG,
                                "output_data_formats should match"
                                " the size of output")
@@ -435,7 +455,8 @@ def format_model_config(flags):
                                "'output_data_formats' must be in "
                                + str(DataFormatStrs))
             else:
-                subgraph[YAMLKeyword.output_data_formats] = [DataFormat.NHWC]
+                subgraph[YAMLKeyword.output_data_formats] =\
+                    [DataFormat.NHWC] * output_size
 
             validation_threshold = subgraph.get(
                 YAMLKeyword.validation_threshold, {})
@@ -448,6 +469,8 @@ def format_model_config(flags):
                 DeviceType.GPU: ValidationThreshold.gpu_threshold,
                 DeviceType.HEXAGON + "_QUANTIZE":
                     ValidationThreshold.hexagon_threshold,
+                DeviceType.HTA + "_QUANTIZE":
+                    ValidationThreshold.hexagon_threshold,
                 DeviceType.CPU + "_QUANTIZE":
                     ValidationThreshold.cpu_quantize_threshold,
             }
@@ -457,6 +480,7 @@ def format_model_config(flags):
                 if k.upper() not in (DeviceType.CPU,
                                      DeviceType.GPU,
                                      DeviceType.HEXAGON,
+                                     DeviceType.HTA,
                                      DeviceType.CPU + "_QUANTIZE"):
                     raise argparse.ArgumentTypeError(
                         'Unsupported validation threshold runtime: %s' % k)
@@ -476,6 +500,14 @@ def format_model_config(flags):
             onnx_backend = subgraph.get(
                 YAMLKeyword.backend, "tensorflow")
             subgraph[YAMLKeyword.backend] = onnx_backend
+            validation_outputs_data = subgraph.get(
+                YAMLKeyword.validation_outputs_data, [])
+            if not isinstance(validation_outputs_data, list):
+                subgraph[YAMLKeyword.validation_outputs_data] = [
+                    validation_outputs_data]
+            else:
+                subgraph[YAMLKeyword.validation_outputs_data] = \
+                    validation_outputs_data
             input_ranges = subgraph.get(
                 YAMLKeyword.input_ranges, [])
             if not isinstance(input_ranges, list):
@@ -728,7 +760,6 @@ def build_model_lib(configs, address_sanitizer):
     # create model library dir
     library_name = configs[YAMLKeyword.library_name]
     for target_abi in configs[YAMLKeyword.target_abis]:
-        hexagon_mode = get_hexagon_mode(configs)
         model_lib_output_path = get_model_lib_output_path(library_name,
                                                           target_abi)
         library_out_dir = os.path.dirname(model_lib_output_path)
@@ -739,7 +770,8 @@ def build_model_lib(configs, address_sanitizer):
             MODEL_LIB_TARGET,
             abi=target_abi,
             toolchain=toolchain,
-            hexagon_mode=hexagon_mode,
+            enable_hexagon=get_hexagon_mode(configs),
+            enable_hta=get_hta_mode(configs),
             enable_opencl=get_opencl_mode(configs),
             enable_quantize=get_quantize_mode(configs),
             address_sanitizer=address_sanitizer,
@@ -830,7 +862,6 @@ def report_run_statistics(stdout,
 
 def build_mace_run(configs, target_abi, toolchain, enable_openmp,
                    address_sanitizer, mace_lib_type):
     library_name = configs[YAMLKeyword.library_name]
-    hexagon_mode = get_hexagon_mode(configs)
 
     build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi)
     if os.path.exists(build_tmp_binary_dir):
@@ -853,7 +884,8 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
         mace_run_target,
         abi=target_abi,
         toolchain=toolchain,
-        hexagon_mode=hexagon_mode,
+        enable_hexagon=get_hexagon_mode(configs),
+        enable_hta=get_hta_mode(configs),
         enable_openmp=enable_openmp,
         enable_opencl=get_opencl_mode(configs),
         enable_quantize=get_quantize_mode(configs),
@@ -868,7 +900,6 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
 def build_example(configs, target_abi, toolchain, enable_openmp,
                   mace_lib_type, cl_binary_to_code, device):
     library_name = configs[YAMLKeyword.library_name]
-    hexagon_mode = get_hexagon_mode(configs)
 
     build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi)
     if os.path.exists(build_tmp_binary_dir):
@@ -902,7 +933,8 @@ def build_example(configs, target_abi, toolchain,
             enable_openmp=enable_openmp,
             enable_opencl=get_opencl_mode(configs),
             enable_quantize=get_quantize_mode(configs),
-            hexagon_mode=hexagon_mode,
+            enable_hexagon=get_hexagon_mode(configs),
+            enable_hta=get_hta_mode(configs),
             address_sanitizer=flags.address_sanitizer,
             symbol_hidden=symbol_hidden)
 
@@ -933,7 +965,8 @@ def build_example(configs, target_abi, toolchain,
         enable_openmp=enable_openmp,
         enable_opencl=get_opencl_mode(configs),
         enable_quantize=get_quantize_mode(configs),
-        hexagon_mode=hexagon_mode,
+        enable_hexagon=get_hexagon_mode(configs),
+        enable_hta=get_hta_mode(configs),
         address_sanitizer=flags.address_sanitizer,
         extra_args=build_arg)
 
@@ -991,8 +1024,11 @@ def run_mace(flags):
                                    flags.address_sanitizer,
                                    flags.mace_lib_type)
                 # run
+                start_time = time.time()
                 with device.lock():
                     device.run_specify_abi(flags, configs, target_abi)
+                elapse_minutes = (time.time() - start_time) / 60
+                print("Elapsed time: %f minutes." % elapse_minutes)
             elif dev[YAMLKeyword.device_name] != SystemType.host:
                 six.print_('The device with soc %s do not support abi %s' %
                            (dev[YAMLKeyword.target_socs], target_abi),
@@ -1013,7 +1049,6 @@ def build_benchmark_model(configs,
                           enable_openmp,
                           mace_lib_type):
     library_name = configs[YAMLKeyword.library_name]
-    hexagon_mode = get_hexagon_mode(configs)
 
     link_dynamic = mace_lib_type == MACELibType.dynamic
     if link_dynamic:
@@ -1036,7 +1071,8 @@ def build_benchmark_model(configs,
         enable_openmp=enable_openmp,
         enable_opencl=get_opencl_mode(configs),
         enable_quantize=get_quantize_mode(configs),
-        hexagon_mode=hexagon_mode,
+        enable_hexagon=get_hexagon_mode(configs),
+        enable_hta=get_hta_mode(configs),
        symbol_hidden=symbol_hidden,
         extra_args=build_arg)
     # clear tmp binary dir
@@ -1075,8 +1111,11 @@ def benchmark_model(flags):
                                           not flags.disable_openmp,
                                           flags.mace_lib_type)
                 device = DeviceWrapper(dev)
+                start_time = time.time()
                 with device.lock():
                     device.bm_specific_target(flags, configs, target_abi)
+                elapse_minutes = (time.time() - start_time) / 60
+                print("Elapsed time: %f minutes." % elapse_minutes)
             else:
                 six.print_('There is no abi %s with soc %s' %
                            (target_abi, dev[YAMLKeyword.target_socs]),
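The format_model_config changes above apply one normalization rule to input_data_formats and output_data_formats alike: a missing or scalar YAML value is broadcast to one entry per tensor, and an explicit list must match the tensor count, so downstream code can always index the list positionally. The rule in isolation (standalone sketch; input_size and the sample value are assumptions):

    input_size = 2
    value = "NCHW"                       # scalar entry from the deployment YAML
    if not isinstance(value, list):
        formats = [value] * input_size   # "NCHW" -> ["NCHW", "NCHW"]
    else:
        assert len(value) == input_size  # explicit lists must match
        formats = value
    default_formats = ["NHWC"] * input_size  # used when the key is absent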
diff --git a/tools/device.py b/tools/device.py
index 07e92878db3b0e0effc6814aa5d68df10dc16983..0ff868f482cf0a9d6d7e69cd854a10ee863d51ae 100644
--- a/tools/device.py
+++ b/tools/device.py
@@ -154,7 +154,9 @@ class DeviceWrapper:
                 input_nodes,
                 output_nodes,
                 input_shapes,
+                input_data_formats,
                 output_shapes,
+                output_data_formats,
                 mace_model_dir,
                 model_tag,
                 device_type,
@@ -206,6 +208,7 @@ class DeviceWrapper:
             p = subprocess.Popen(
                 [
                     "env",
+                    "ASAN_OPTIONS=detect_leaks=1",
                     "LD_LIBRARY_PATH=%s" % libmace_dynamic_lib_path,
                     "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
                     "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio,
@@ -216,6 +219,8 @@ class DeviceWrapper:
                     "--output_node=%s" % ",".join(output_nodes),
                     "--input_shape=%s" % ":".join(input_shapes),
                     "--output_shape=%s" % ":".join(output_shapes),
+                    "--input_data_format=%s" % ",".join(input_data_formats),
+                    "--output_data_format=%s" % ",".join(output_data_formats),
                     "--input_file=%s/%s" % (model_output_dir,
                                             input_file_name),
                     "--output_file=%s/%s" % (model_output_dir,
@@ -307,6 +312,8 @@ class DeviceWrapper:
                 "--output_node=%s" % ",".join(output_nodes),
                 "--input_shape=%s" % ":".join(input_shapes),
                 "--output_shape=%s" % ":".join(output_shapes),
+                "--input_data_format=%s" % ",".join(input_data_formats),
+                "--output_data_format=%s" % ",".join(output_data_formats),
                 "--input_file=%s/%s" % (self.data_dir, input_file_name),
                 "--output_file=%s/%s" % (self.data_dir, output_file_name),
                 "--input_dir=%s" % input_dir,
@@ -394,6 +401,8 @@ class DeviceWrapper:
             output_nodes=subgraphs[0][YAMLKeyword.output_tensors],
             input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
             output_shapes=subgraphs[0][YAMLKeyword.output_shapes],
+            input_data_formats=subgraphs[0][YAMLKeyword.input_data_formats],
+            output_data_formats=subgraphs[0][YAMLKeyword.output_data_formats],
             mace_model_dir=mace_model_dir,
             model_tag=model_name,
             device_type=DeviceType.GPU,
@@ -587,6 +596,10 @@ class DeviceWrapper:
                             YAMLKeyword.output_tensors],
                         input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
                         output_shapes=output_config[YAMLKeyword.output_shapes],
+                        input_data_formats=subgraphs[0][
+                            YAMLKeyword.input_data_formats],
+                        output_data_formats=subgraphs[0][
+                            YAMLKeyword.output_data_formats],
                         mace_model_dir=mace_model_dir,
                         model_tag=model_name,
                         device_type=device_type,
@@ -652,6 +665,10 @@ class DeviceWrapper:
                                 YAMLKeyword.input_shapes],
                             output_shapes=output_config[
                                 YAMLKeyword.output_shapes],
+                            input_data_formats=subgraphs[0][
+                                YAMLKeyword.input_data_formats],
+                            output_data_formats=subgraphs[0][
+                                YAMLKeyword.output_data_formats],
                             model_output_dir=model_output_dir,
                             input_data_types=subgraphs[0][
                                 YAMLKeyword.input_data_types],
@@ -660,6 +677,8 @@ class DeviceWrapper:
                                 YAMLKeyword.validation_threshold][
                                 validate_type],
                             backend=subgraphs[0][YAMLKeyword.backend],
+                            validation_outputs_data=subgraphs[0][
+                                YAMLKeyword.validation_outputs_data],
                             log_file=log_file,
                         )
                     if flags.report and flags.round > 0:
@@ -748,6 +767,8 @@ class DeviceWrapper:
                 output_nodes,
                 input_shapes,
                 output_shapes,
+                input_data_formats,
+                output_data_formats,
                 max_num_runs,
                 max_seconds,
                 model_tag,
@@ -788,6 +809,8 @@ class DeviceWrapper:
                 '--output_node=%s' % ','.join(output_nodes),
                 '--input_shape=%s' % ':'.join(input_shapes),
                 '--output_shape=%s' % ':'.join(output_shapes),
+                "--input_data_format=%s" % ",".join(input_data_formats),
+                "--output_data_format=%s" % ",".join(output_data_formats),
                 '--input_file=%s/%s' % (model_output_dir,
                                         input_file_name),
                 "--model_data_file=%s" % model_data_file,
                 '--max_num_runs=%d' % max_num_runs,
@@ -843,6 +866,8 @@ class DeviceWrapper:
                 '--output_node=%s' % ','.join(output_nodes),
                 '--input_shape=%s' % ':'.join(input_shapes),
                 '--output_shape=%s' % ':'.join(output_shapes),
+                "--input_data_format=%s" % ",".join(input_data_formats),
+                "--output_data_format=%s" % ",".join(output_data_formats),
                 '--input_file=%s/%s' % (self.data_dir, input_file_name),
                 "--model_data_file=%s" % model_data_file,
                 '--max_num_runs=%d' % max_num_runs,
@@ -959,6 +984,10 @@ class DeviceWrapper:
                 output_nodes=output_nodes,
                 input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
                 output_shapes=output_shapes,
+                input_data_formats=subgraphs[0][
+                    YAMLKeyword.input_data_formats],
+                output_data_formats=subgraphs[0][
+                    YAMLKeyword.output_data_formats],
                 max_num_runs=flags.max_num_runs,
                 max_seconds=flags.max_seconds,
                 mace_model_dir=mace_model_dir,
@@ -972,8 +1001,7 @@ class DeviceWrapper:
                 opencl_binary_file=opencl_output_bin_path,
                 opencl_parameter_file=opencl_parameter_path,
                 libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH,
-                link_dynamic=link_dynamic
-            )
+                link_dynamic=link_dynamic)
 
     def run(self,
             abi,
diff --git a/tools/generate_data.py b/tools/generate_data.py
index 5ad0340e456df423fd36d37a3b565eb500fad39b..b80f0c20c9964b7e02c04b6d313f55e87a00cc20 100644
--- a/tools/generate_data.py
+++ b/tools/generate_data.py
@@ -59,7 +59,7 @@ def generate_input_data(input_file, input_node, input_shape, input_ranges,
     assert len(input_names) == len(input_shapes) == len(input_ranges) == len(input_data_types)  # noqa
 
     for i in range(len(input_names)):
-        shape = [int(x) for x in input_shapes[i].split(',')]
+        shape = [int(x) for x in common.split_shape(input_shapes[i])]
         input_range = [float(x) for x in input_ranges[i].split(',')]
         generate_data(input_names[i], shape, input_file,
                       input_range, input_data_types[i])
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 035348ff2baae673ae798ce6cf4e40c771b877b8..6a8746b1e593154d580d7143d5c98a75977ee07c 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -69,7 +69,7 @@ def device_lock_path(serialno):
     return "/tmp/device-lock-%s" % serialno
 
-def device_lock(serialno, timeout=3600):
+def device_lock(serialno, timeout=7200):
     import filelock
     return filelock.FileLock(device_lock_path(serialno), timeout=timeout)
@@ -263,7 +263,8 @@ def find_simpleperf_library(abi, simpleperf_path=''):
 def bazel_build(target,
                 abi="armeabi-v7a",
                 toolchain='android',
-                hexagon_mode=False,
+                enable_hexagon=False,
+                enable_hta=False,
                 enable_openmp=True,
                 enable_neon=True,
                 enable_opencl=True,
@@ -275,6 +276,8 @@ def bazel_build(target,
     if abi == "host":
         bazel_args = (
             "build",
+            "--config",
+            platform.system().lower(),
             "--define",
             "openmp=%s" % str(enable_openmp).lower(),
             "--define",
@@ -297,13 +300,15 @@ def bazel_build(target,
             "--define",
             "quantize=%s" % str(enable_quantize).lower(),
             "--define",
-            "hexagon=%s" % str(hexagon_mode).lower())
+            "hexagon=%s" % str(enable_hexagon).lower(),
+            "--define",
+            "hta=%s" % str(enable_hta).lower())
         if address_sanitizer:
             bazel_args += ("--config", "asan")
         else:
             bazel_args += ("--config", "optimization")
-    if symbol_hidden:
-        bazel_args += ("--config", "symbol_hidden")
+            if symbol_hidden:
+                bazel_args += ("--config", "symbol_hidden")
     if extra_args:
         bazel_args += (extra_args,)
     six.print_(bazel_args)
@@ -649,6 +654,8 @@ def validate_model(abi,
                    output_nodes,
                    input_shapes,
                    output_shapes,
+                   input_data_formats,
+                   output_data_formats,
                    model_output_dir,
                    input_data_types,
                    caffe_env,
@@ -656,9 +663,12 @@ def validate_model(abi,
                    output_file_name="model_out",
                    validation_threshold=0.9,
                    backend="tensorflow",
-                   log_file="",
-                   ):
-    six.print_("* Validate with %s" % platform)
+                   validation_outputs_data=[],
+                   log_file=""):
+    if not validation_outputs_data:
+        six.print_("* Validate with %s" % platform)
+    else:
+        six.print_("* Validate with file: %s" % validation_outputs_data)
     if abi != "host":
         for output_name in output_nodes:
             formatted_name = common.formatted_file_name(
@@ -668,21 +678,15 @@ def validate_model(abi,
                 sh.rm("-rf", "%s/%s" % (model_output_dir, formatted_name))
             device.pull_from_data_dir(formatted_name, model_output_dir)
 
-    if platform == "tensorflow":
-        validate(platform, model_file_path, "",
-                 "%s/%s" % (model_output_dir, input_file_name),
-                 "%s/%s" % (model_output_dir, output_file_name), device_type,
-                 ":".join(input_shapes), ":".join(output_shapes),
-                 ",".join(input_nodes), ",".join(output_nodes),
-                 validation_threshold, ",".join(input_data_types), backend,
-                 log_file)
-    elif platform == "onnx":
+    if platform == "tensorflow" or platform == "onnx":
         validate(platform, model_file_path, "",
                  "%s/%s" % (model_output_dir, input_file_name),
                  "%s/%s" % (model_output_dir, output_file_name), device_type,
                  ":".join(input_shapes), ":".join(output_shapes),
+                 ",".join(input_data_formats), ",".join(output_data_formats),
                  ",".join(input_nodes), ",".join(output_nodes),
                  validation_threshold, ",".join(input_data_types), backend,
+                 validation_outputs_data,
                  log_file)
     elif platform == "caffe":
         image_name = "mace-caffe:" + docker_image_tag
@@ -698,8 +702,11 @@ def validate_model(abi,
                      "%s/%s" % (model_output_dir, output_file_name),
                      device_type,
                      ":".join(input_shapes), ":".join(output_shapes),
+                     ",".join(input_data_formats),
+                     ",".join(output_data_formats),
                      ",".join(input_nodes), ",".join(output_nodes),
                      validation_threshold, ",".join(input_data_types), backend,
+                     validation_outputs_data,
                      log_file)
         elif caffe_env == common.CaffeEnvType.DOCKER:
             docker_image_id = sh.docker("images", "-q", image_name)
@@ -764,9 +771,13 @@ def validate_model(abi,
                 "--output_node=%s" % ",".join(output_nodes),
                 "--input_shape=%s" % ":".join(input_shapes),
                 "--output_shape=%s" % ":".join(output_shapes),
+                "--input_data_format=%s" % ",".join(input_data_formats),
+                "--output_data_format=%s" % ",".join(output_data_formats),
                 "--validation_threshold=%f" % validation_threshold,
                 "--input_data_type=%s" % ",".join(input_data_types),
                 "--backend=%s" % ",".join(backend),
+                "--validation_outputs_data=%s" % ",".join(
+                    validation_outputs_data),
                 "--log_file=%s" % log_file,
                 _fg=True)
diff --git a/tools/validate.py b/tools/validate.py
index 2ea8fed2786b37ef2950deb706482d284cace6fd..47dc6c019ee790b264eb27a85732a635941db0b9 100644
--- a/tools/validate.py
+++ b/tools/validate.py
@@ -18,6 +18,7 @@ import os
 import os.path
 import numpy as np
 import re
+import six
 
 import common
@@ -67,6 +68,8 @@ def calculate_similarity(u, v, data_type=np.float64):
 
 def calculate_pixel_accuracy(out_value, mace_out_value):
+    if len(out_value.shape) < 2:
+        return 1.0
     out_value = out_value.reshape((-1, out_value.shape[-1]))
     batches = out_value.shape[0]
     classes = out_value.shape[1]
@@ -121,10 +124,37 @@ def normalize_tf_tensor_name(name):
     return name
 
-def validate_tf_model(platform, device_type, model_file, input_file,
-                      mace_out_file, input_names, input_shapes,
-                      output_names, validation_threshold, input_data_types,
-                      log_file):
+def validate_with_file(platform, device_type,
+                       output_names, output_shapes,
+                       mace_out_file, validation_outputs_data,
+                       validation_threshold, log_file):
+    for i in range(len(output_names)):
+        if validation_outputs_data[i].startswith("http://") or \
+                validation_outputs_data[i].startswith("https://"):
+            validation_file_name = common.formatted_file_name(
+                mace_out_file, output_names[i] + '_validation')
+            six.moves.urllib.request.urlretrieve(validation_outputs_data[i],
+                                                 validation_file_name)
+        else:
+            validation_file_name = validation_outputs_data[i]
+        value = load_data(validation_file_name)
+        out_shape = output_shapes[i]
+        if len(out_shape) == 4:
+            out_shape[1], out_shape[2], out_shape[3] = \
+                out_shape[3], out_shape[1], out_shape[2]
+            value = value.reshape(out_shape).transpose((0, 2, 3, 1))
+        output_file_name = common.formatted_file_name(
+            mace_out_file, output_names[i])
+        mace_out_value = load_data(output_file_name)
+        compare_output(platform, device_type, output_names[i], mace_out_value,
+                       value, validation_threshold, log_file)
+
+
+def validate_tf_model(platform, device_type, model_file,
+                      input_file, mace_out_file,
+                      input_names, input_shapes, input_data_formats,
+                      output_names, output_shapes, output_data_formats,
+                      validation_threshold, input_data_types, log_file):
     import tensorflow as tf
     if not os.path.isfile(model_file):
         common.MaceLogger.error(
@@ -147,6 +177,13 @@ def validate_tf_model(platform, device_type, model_file, input_file,
                     common.formatted_file_name(input_file, input_names[i]),
                     input_data_types[i])
                 input_value = input_value.reshape(input_shapes[i])
+                if input_data_formats[i] == common.DataFormat.NCHW and\
+                        len(input_shapes[i]) == 4:
+                    input_value = input_value.transpose((0, 2, 3, 1))
+                elif input_data_formats[i] == common.DataFormat.OIHW and \
+                        len(input_shapes[i]) == 4:
+                    # OIHW -> HWIO
+                    input_value = input_value.transpose((2, 3, 1, 0))
                 input_node = graph.get_tensor_by_name(
                     normalize_tf_tensor_name(input_names[i]))
                 input_dict[input_node] = input_value
@@ -161,15 +198,20 @@ def validate_tf_model(platform, device_type, model_file, input_file,
                 output_file_name = common.formatted_file_name(
                     mace_out_file, output_names[i])
                 mace_out_value = load_data(output_file_name)
+                if output_data_formats[i] == common.DataFormat.NCHW and\
+                        len(output_shapes[i]) == 4:
+                    mace_out_value = mace_out_value.\
+                        reshape(output_shapes[i]).transpose((0, 2, 3, 1))
                 compare_output(platform, device_type, output_names[i],
                                mace_out_value, output_values[i],
                                validation_threshold, log_file)
 
-def validate_caffe_model(platform, device_type, model_file, input_file,
-                         mace_out_file, weight_file, input_names, input_shapes,
-                         output_names, output_shapes, validation_threshold,
-                         log_file):
+def validate_caffe_model(platform, device_type, model_file, input_file,
+                         mace_out_file, weight_file,
+                         input_names, input_shapes, input_data_formats,
+                         output_names, output_shapes, output_data_formats,
+                         validation_threshold, log_file):
     os.environ['GLOG_minloglevel'] = '1'  # suprress Caffe verbose prints
     import caffe
     if not os.path.isfile(model_file):
@@ -188,8 +230,10 @@ def validate_caffe_model(platform, device_type, model_file, input_file,
     for i in range(len(input_names)):
         input_value = load_data(
             common.formatted_file_name(input_file, input_names[i]))
-        input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1,
-                                                                      2))
+        input_value = input_value.reshape(input_shapes[i])
+        if input_data_formats[i] == common.DataFormat.NHWC and \
+                len(input_shapes[i]) == 4:
+            input_value = input_value.transpose((0, 3, 1, 2))
         input_blob_name = input_names[i]
         try:
             if input_names[i] in net.top_names:
@@ -205,22 +249,23 @@ def validate_caffe_model(platform, device_type, model_file, input_file,
     for i in range(len(output_names)):
         value = net.blobs[output_names[i]].data
-        out_shape = output_shapes[i]
-        if len(out_shape) == 4:
-            out_shape[1], out_shape[2], out_shape[3] = \
-                out_shape[3], out_shape[1], out_shape[2]
-            value = value.reshape(out_shape).transpose((0, 2, 3, 1))
         output_file_name = common.formatted_file_name(
             mace_out_file, output_names[i])
         mace_out_value = load_data(output_file_name)
+        if output_data_formats[i] == common.DataFormat.NHWC and \
+                len(output_shapes[i]) == 4:
+            mace_out_value = mace_out_value.reshape(output_shapes[i])\
+                .transpose((0, 3, 1, 2))
         compare_output(platform, device_type, output_names[i],
                        mace_out_value, value, validation_threshold, log_file)
 
-def validate_onnx_model(platform, device_type, model_file, input_file,
-                        mace_out_file, input_names, input_shapes,
-                        output_names, output_shapes, validation_threshold,
-                        input_data_types, backend, log_file):
+def validate_onnx_model(platform, device_type, model_file,
+                        input_file, mace_out_file,
+                        input_names, input_shapes, input_data_formats,
+                        output_names, output_shapes, output_data_formats,
+                        validation_threshold, input_data_types,
+                        backend, log_file):
     import onnx
     if backend == "tensorflow":
         from onnx_tf.backend import prepare
@@ -242,13 +287,16 @@ def validate_onnx_model(platform, device_type, model_file, input_file,
         input_value = load_data(common.formatted_file_name(input_file,
                                                            input_names[i]),
                                 input_data_types[i])
-        input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1,
-                                                                      2))
+        input_value = input_value.reshape(input_shapes[i])
+        if input_data_formats[i] == common.DataFormat.NHWC and \
+                len(input_shapes[i]) == 4:
+            input_value = input_value.transpose((0, 3, 1, 2))
         input_dict[input_names[i]] = input_value
     onnx_outputs = []
     for i in range(len(output_names)):
         out_shape = output_shapes[i]
-        if len(out_shape) == 4:
+        if output_data_formats[i] == common.DataFormat.NHWC and\
+                len(out_shape) == 4:
             out_shape[1], out_shape[2], out_shape[3] = \
                 out_shape[3], out_shape[1], out_shape[2]
         onnx_outputs.append(
@@ -262,24 +310,32 @@ def validate_onnx_model(platform, device_type, model_file, input_file,
     for i in range(len(output_names)):
         out_name = output_names[i]
         value = output_values[out_name].flatten()
-        out_shape = output_shapes[i]
-        if len(out_shape) == 4:
-            value = value.reshape(out_shape).transpose((0, 2, 3, 1))
         output_file_name = common.formatted_file_name(mace_out_file,
                                                       output_names[i])
         mace_out_value = load_data(output_file_name)
+        if output_data_formats[i] == common.DataFormat.NHWC and \
+                len(output_shapes[i]) == 4:
+            mace_out_value = mace_out_value.reshape(output_shapes[i]) \
+                .transpose((0, 3, 1, 2))
         compare_output(platform, device_type, output_names[i],
                        mace_out_value, value, validation_threshold, log_file)
 
-def validate(platform, model_file, weight_file, input_file, mace_out_file,
-             device_type, input_shape, output_shape, input_node, output_node,
-             validation_threshold, input_data_type, backend, log_file):
+def validate(platform, model_file, weight_file, input_file, mace_out_file,
+             device_type, input_shape, output_shape, input_data_format_str,
+             output_data_format_str, input_node, output_node,
+             validation_threshold, input_data_type, backend,
+             validation_outputs_data, log_file):
     input_names = [name for name in input_node.split(',')]
     input_shape_strs = [shape for shape in input_shape.split(':')]
-    input_shapes = [[int(x) for x in shape.split(',')]
+    input_shapes = [[int(x) for x in common.split_shape(shape)]
                     for shape in input_shape_strs]
+    output_shape_strs = [shape for shape in output_shape.split(':')]
+    output_shapes = [[int(x) for x in common.split_shape(shape)]
+                     for shape in output_shape_strs]
+    input_data_formats = [df for df in input_data_format_str.split(',')]
+    output_data_formats = [df for df in output_data_format_str.split(',')]
     if input_data_type:
         input_data_types = [data_type
                             for data_type in input_data_type.split(',')]
@@ -287,27 +343,35 @@ def validate(platform, model_file, weight_file, input_file, mace_out_file,
         input_data_types = ['float32'] * len(input_names)
     output_names = [name for name in output_node.split(',')]
     assert len(input_names) == len(input_shapes)
-
-    if platform == 'tensorflow':
-        validate_tf_model(platform, device_type, model_file, input_file,
-                          mace_out_file, input_names, input_shapes,
-                          output_names, validation_threshold, input_data_types,
+    if not isinstance(validation_outputs_data, list):
+        if os.path.isfile(validation_outputs_data):
+            validation_outputs = [validation_outputs_data]
+        else:
+            validation_outputs = []
+    else:
+        validation_outputs = validation_outputs_data
+    if validation_outputs:
+        validate_with_file(platform, device_type, output_names, output_shapes,
+                           mace_out_file, validation_outputs,
+                           validation_threshold, log_file)
+    elif platform == 'tensorflow':
+        validate_tf_model(platform, device_type,
+                          model_file, input_file, mace_out_file,
+                          input_names, input_shapes, input_data_formats,
+                          output_names, output_shapes, output_data_formats,
+                          validation_threshold, input_data_types,
                           log_file)
     elif platform == 'caffe':
-        output_shape_strs = [shape for shape in output_shape.split(':')]
-        output_shapes = [[int(x) for x in shape.split(',')]
-                         for shape in output_shape_strs]
-        validate_caffe_model(platform, device_type, model_file, input_file,
-                             mace_out_file, weight_file, input_names,
-                             input_shapes, output_names, output_shapes,
+        validate_caffe_model(platform, device_type, model_file,
+                             input_file, mace_out_file, weight_file,
+                             input_names, input_shapes, input_data_formats,
+                             output_names, output_shapes, output_data_formats,
                              validation_threshold, log_file)
     elif platform == 'onnx':
-        output_shape_strs = [shape for shape in output_shape.split(':')]
-        output_shapes = [[int(x) for x in shape.split(',')]
-                         for shape in output_shape_strs]
-        validate_onnx_model(platform, device_type, model_file, input_file,
-                            mace_out_file, input_names, input_shapes,
-                            output_names, output_shapes,
+        validate_onnx_model(platform, device_type, model_file,
+                            input_file, mace_out_file,
+                            input_names, input_shapes, input_data_formats,
+                            output_names, output_shapes, output_data_formats,
                             validation_threshold, input_data_types,
                             backend, log_file)
 
@@ -338,8 +402,14 @@ def parse_args():
         "--device_type", type=str, default="", help="mace runtime device.")
     parser.add_argument(
         "--input_shape", type=str, default="1,64,64,3", help="input shape.")
+    parser.add_argument(
+        "--input_data_format", type=str, default="NHWC",
+        help="input data format.")
     parser.add_argument(
         "--output_shape", type=str, default="1,64,64,2", help="output shape.")
+    parser.add_argument(
+        "--output_data_format", type=str, default="NHWC",
+        help="output data format.")
     parser.add_argument(
         "--input_node", type=str, default="input_node", help="input node")
     parser.add_argument(
@@ -358,10 +428,10 @@ def parse_args():
         default="tensorflow",
         help="onnx backend framwork")
     parser.add_argument(
-        "--log_file",
-        type=str,
-        default="",
-        help="log file")
+        "--validation_outputs_data", type=str,
+        default="", help="validation outputs data file path.")
+    parser.add_argument(
+        "--log_file", type=str, default="", help="log file.")
     return parser.parse_known_args()
 
@@ -376,9 +446,12 @@ if __name__ == '__main__':
         FLAGS.device_type,
         FLAGS.input_shape,
         FLAGS.output_shape,
+        FLAGS.input_data_format,
+        FLAGS.output_data_format,
         FLAGS.input_node,
         FLAGS.output_node,
         FLAGS.validation_threshold,
         FLAGS.input_data_type,
         FLAGS.backend,
+        FLAGS.validation_outputs_data,
         FLAGS.log_file)
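All of the transposes introduced in tools/validate.py convert 4-D tensors between layouts: NHWC to NCHW is transpose((0, 3, 1, 2)), the inverse NCHW to NHWC is transpose((0, 2, 3, 1)), and OIHW to HWIO (the TensorFlow filter layout) is transpose((2, 3, 1, 0)). A self-contained numpy check of those axis orders, with example shapes assumed:

    import numpy as np

    nhwc = np.random.rand(1, 64, 64, 3)              # N, H, W, C
    nchw = nhwc.transpose((0, 3, 1, 2))              # NHWC -> NCHW
    assert nchw.shape == (1, 3, 64, 64)
    assert np.array_equal(nchw.transpose((0, 2, 3, 1)), nhwc)  # round trip

    oihw = np.random.rand(8, 3, 5, 5)                # O, I, kH, kW filters
    hwio = oihw.transpose((2, 3, 1, 0))              # OIHW -> HWIO
    assert hwio.shape == (5, 5, 3, 8)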