“72be7a615190dcb3657c6811b5ac3f4bc6e55f74”上不存在“paddle/fluid/lite/kernels/x86/reshape_compute.cc”
提交 373f1eff 编写于 作者: L luxuhui

add mace micro

N/A
Signed-off-by: NLuxuhui <luxuhui@xiaomi.com>
上级 9cd813b0
...@@ -22,6 +22,9 @@ mace/codegen/version/ ...@@ -22,6 +22,9 @@ mace/codegen/version/
mace/codegen/engine/ mace/codegen/engine/
mace/codegen/lib/ mace/codegen/lib/
micro/codegen/models/
micro/codegen/engines/
examples/android/macelibrary/src/main/cpp/mace/ examples/android/macelibrary/src/main/cpp/mace/
examples/android/macelibrary/src/main/cpp/include/ examples/android/macelibrary/src/main/cpp/include/
examples/android/macelibrary/src/main/cpp/lib/arm64-v8a/ examples/android/macelibrary/src/main/cpp/lib/arm64-v8a/
......
...@@ -80,12 +80,14 @@ mace_cc_test: ...@@ -80,12 +80,14 @@ mace_cc_test:
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi fi
- python tools/bazel_adb_run.py --target="//test/ccunit:mace_cc_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS - python tools/bazel_adb_run.py --target="//test/ccunit:mace_cc_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//micro/test/ccunit:micro_ops_test" --run_target=True --stdout_processor=ops_benchmark_stdout_processor --target_abis=arm64-v8a
mace_cc_benchmark: mace_cc_benchmark:
stage: test stage: test
script: script:
- if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi - if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
- python tools/bazel_adb_run.py --target="//test/ccbenchmark:mace_cc_benchmark" --run_target=True --stdout_processor=ops_benchmark_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS --args="--filter=.*SIGMOID.*" - python tools/bazel_adb_run.py --target="//test/ccbenchmark:mace_cc_benchmark" --run_target=True --stdout_processor=ops_benchmark_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS --args="--filter=.*SIGMOID.*"
- python tools/bazel_adb_run.py --target="//micro/test/ccbenchmark:micro_cc_benchmark" --run_target=True --stdout_processor=ops_benchmark_stdout_processor --target_abis=arm64-v8a
only: only:
- triggers - triggers
...@@ -112,6 +114,13 @@ model_tests: ...@@ -112,6 +114,13 @@ model_tests:
- python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=code --model_data_format=file - python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=code --model_data_format=file
- python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=code --model_data_format=file - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=code --model_data_format=file
- python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=5 --model_graph_format=code --model_data_format=file --benchmark - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=5 --model_graph_format=code --model_data_format=file --benchmark
- CONF_FILE=mace-models/micro-models/har-cnn/har-cnn.yml
- python tools/converter.py convert --config=${CONF_FILE} --enable_micro
- python tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name har_cnn
- python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build --benchmark
- CONF_FILE=mace-models/micro-models/har-cnn/har-cnn-bf16.yml
- python tools/converter.py convert --config=${CONF_FILE} --enable_micro
- python tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name har_cnn
- rm -rf mace-models - rm -rf mace-models
quantization_tests: quantization_tests:
......
...@@ -3,6 +3,7 @@ workspace(name = "mace") ...@@ -3,6 +3,7 @@ workspace(name = "mace")
# generate version and opencl kernel code. # generate version and opencl kernel code.
load("//repository/git:git_configure.bzl", "git_version_repository") load("//repository/git:git_configure.bzl", "git_version_repository")
load("//repository/opencl-kernel:opencl_kernel_configure.bzl", "encrypt_opencl_kernel_repository") load("//repository/opencl-kernel:opencl_kernel_configure.bzl", "encrypt_opencl_kernel_repository")
load("//micro:micro.bzl", "new_local_repository_env")
git_version_repository(name = "local_version_config") git_version_repository(name = "local_version_config")
...@@ -161,3 +162,15 @@ new_http_archive( ...@@ -161,3 +162,15 @@ new_http_archive(
"https://releases.linaro.org/components/toolchain/binaries/7.3-2018.05/aarch64-linux-gnu/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu.tar.xz", "https://releases.linaro.org/components/toolchain/binaries/7.3-2018.05/aarch64-linux-gnu/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu.tar.xz",
], ],
) )
new_local_repository_env(
name = "hexagon_sdk",
build_file = "third_party/hexagon/hexagon_sdk.BUILD",
path = "${HEXAGON_SDK_ROOT}",
)
new_local_repository_env(
name = "hexagon_tools",
build_file = "third_party/hexagon/hexagon_tools.BUILD",
path = "${HL_HEXAGON_TOOLS}",
)
...@@ -46,6 +46,13 @@ The main documentation is organized into the following sections: ...@@ -46,6 +46,13 @@ The main documentation is organized into the following sections:
development/data_format development/data_format
development/dynamic_lstm development/dynamic_lstm
.. toctree::
:maxdepth: 1
:caption: Micro Controllers
:name: sec-micro
micro-controllers/basic_usage.rst
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
:caption: FAQ :caption: FAQ
......
Basic usage for Micro Controllers
==================================
Build and run an example model
-------------------------------
At first, make sure the environment has been set up correctly already (refer to :doc:`../installation/env_requirement`).
The following are instructions on how to quickly build and run a provided model in
`MACE Model Zoo <https://github.com/XiaoMi/mace-models>`__.
Here we use the har-cnn model as an example.
**Commands**
1. Pull `MACE <https://github.com/XiaoMi/mace>`__ project.
.. code-block:: sh
git clone https://github.com/XiaoMi/mace.git
cd mace/
git fetch --all --tags --prune
# Checkout the latest tag (i.e. release version)
tag_name=`git describe --abbrev=0 --tags`
git checkout tags/${tag_name}
.. note::
It's highly recommended to use a release version instead of master branch.
2. Pull `MACE Model Zoo <https://github.com/XiaoMi/mace-models>`__ project.
.. code-block:: sh
git clone https://github.com/XiaoMi/mace-models.git
3. Convert the pre-trained har-cnn model to c++ code.
.. code-block:: sh
cd path/to/mace
# output lib path: build/har-cnn/model/har_cnn_micro.tar.gz
CONF_FILE=/path/to/mace-models/micro-models/har-cnn/har-cnn.yml
python tools/converter.py convert --config=$CONF_FILE --enable_micro
4. Build Micro-Controllers engine and models to library on host.
.. code-block:: sh
# copy convert result to micro dir ``path/to/micro``
cp build/har-cnn/model/har_cnn_micro.tar.gz path/to/micro/
cd path/to/micro
tar zxvf har_cnn_micro.tar.gz
bazel build //micro/codegen:micro_engine
.. note::
- This step can be skipped if you just want to run a model using ``tools/python/run_micro.py``, such as commands in step 5.
- The build result ``bazel-bin/micro/codegen/libmicro_engine.so``'s abi is host, if you want to run the model on micro controllers, you should build the code with the target abi.
5. Run the model on host.
.. code-block:: sh
CONF_FILE=/path/to/mace-models/micro-models/har-cnn/har-cnn.yml
# Run
python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build
# Test model run time
python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build --round=100
# Validate the correctness by comparing the results against the
# original model and framework, measured with cosine distance for similarity.
python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build --validate
# Validate the layers' correctness.
python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build --validate --layers 0:-1
Deploy your model into applications
------------------------------------
Please refer to \ ``/mace/micro/tools/micro_run.cc`` for full usage. The following lists the key steps.
.. code-block:: cpp
// Include the headers
#include "micro/include/public/micro.h"
// 1. Create MaceMicroEngine instance
MaceMicroEngine *micro_engine = nullptr;
MaceStatus status = har_cnn::GetMicroEngineSingleton(&micro_engine);
// 2. Create and register input buffers
std::vector<std::shared_ptr<char>> inputs;
std::vector<int32_t> input_sizes;
for (size_t i = 0; i < input_shapes.size(); ++i) {
input_sizes.push_back(std::accumulate(input_shapes[i].begin(),
input_shapes[i].end(), sizeof(float),
std::multiplies<int32_t>()));
inputs.push_back(std::shared_ptr<char>(new char[input_sizes[i]],
std::default_delete<char[]>()));
}
// TODO: fill data into input buffers
for (size_t i = 0; i < input_names.size(); ++i) {
micro_engine->RegisterInputData(i, inputs[i].get(),
input_shapes[i].data());
}
// 3. Run the model
MaceStatus status = micro_engine->Run();
// 4. Get the results
for (size_t i = 0; i < output_names.size(); ++i) {
void *output_buffer = nullptr;
const int32_t *output_dims = nullptr;
uint32_t dim_size = 0;
MaceStatus status =
micro_engine->GetOutputData(i, &output_buffer, &output_dims, &dim_size);
// TODO: The result data is in output_buffer; do not delete output_buffer.
}
...@@ -53,10 +53,14 @@ cat <<EOF > ${OUTPUT_FILENAME} ...@@ -53,10 +53,14 @@ cat <<EOF > ${OUTPUT_FILENAME}
// This is a generated file. DO NOT EDIT! // This is a generated file. DO NOT EDIT!
namespace mace { namespace mace {
namespace {
#ifndef _MSC_VER #ifndef _MSC_VER
__attribute__((visibility("default"))) __attribute__((visibility("default")))
#endif #endif
const char *MaceVersion() { return "MACEVER-${GIT_VERSION}" + 8; } const char *kMaceVersion = "MACEVER-${GIT_VERSION}";
} // namespace
const char *MaceVersion() { return kMaceVersion + 8; }
} // namespace mace } // namespace mace
EOF EOF
...@@ -322,7 +322,8 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation( ...@@ -322,7 +322,8 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
.TypeConstraint("T", key_dtype) .TypeConstraint("T", key_dtype)
.Build(); .Build();
if (registry_.at(op_type)->creators.count(key) == 0) { if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key; LOG(FATAL) << "Key not registered: " << key
<< ", op type is: " << operator_def->type();
} }
return registry_.at(op_type)->creators.at(key)(context); return registry_.at(op_type)->creators.at(key)(context);
} }
......
...@@ -8,9 +8,11 @@ package( ...@@ -8,9 +8,11 @@ package(
licenses(["notice"]) # Apache 2.0 licenses(["notice"]) # Apache 2.0
load("@com_google_protobuf//:protobuf.bzl", load(
"py_proto_library", "@com_google_protobuf//:protobuf.bzl",
"cc_proto_library") "cc_proto_library",
"py_proto_library",
)
py_proto_library( py_proto_library(
name = "mace_py", name = "mace_py",
...@@ -27,3 +29,14 @@ cc_proto_library( ...@@ -27,3 +29,14 @@ cc_proto_library(
default_runtime = "@com_google_protobuf//:protobuf_lite", default_runtime = "@com_google_protobuf//:protobuf_lite",
protoc = "@com_google_protobuf//:protoc", protoc = "@com_google_protobuf//:protoc",
) )
py_proto_library(
name = "micro_mem_py",
srcs = ["micro_mem.proto"],
default_runtime = "@com_google_protobuf//:protobuf_python",
protoc = "@com_google_protobuf//:protoc",
srcs_version = "PY2AND3",
deps = [
"@com_google_protobuf//:protobuf_python",
],
)
...@@ -14,6 +14,7 @@ enum DataType { ...@@ -14,6 +14,7 @@ enum DataType {
DT_HALF = 3; DT_HALF = 3;
DT_INT32 = 4; DT_INT32 = 4;
DT_FLOAT16 = 5; DT_FLOAT16 = 5;
DT_BFLOAT16 = 6;
} }
enum MemoryType { enum MemoryType {
...@@ -76,6 +77,7 @@ message OperatorDef { ...@@ -76,6 +77,7 @@ message OperatorDef {
repeated DataType output_type = 8; repeated DataType output_type = 8;
repeated QuantizeActivationInfo quantize_info = 9; repeated QuantizeActivationInfo quantize_info = 9;
// for mace it is mem_id, for micro, it is mem_offset
repeated int32 mem_id = 10; repeated int32 mem_id = 10;
// for hexagon mace-nnlib // for hexagon mace-nnlib
......
// Serialized description of a micro graph: per-operator I/O wiring and the
// graph-level inputs/outputs, produced by the model converter.
syntax = "proto2";

package micro;

// Shape of one operator output (one dimension size per entry).
message OutputShape {
  repeated int64 dims = 1;
}

// Per-operator context. op_idx is presumably the operator's index in the
// model's operator list -- TODO confirm against the converter/codegen.
message OpContext {
  optional int32 op_idx = 1;
  // The input info of downstream operator is the output info of upstream
  // operator, so there is no output info defined here
  repeated uint32 input_infos = 2;
  repeated OutputShape output_resize_shapes = 3;
}

message Graph {
  repeated OpContext op_contexts = 1;
  repeated uint32 input_op_idxs = 2;
  // The output info of the last operator, which is not recorded in opcontext,
  // is the output of graph
  repeated uint32 output_infos = 3;
}
# True when Hexagon support is requested on the command line via
# --define hexagon=true.
config_setting(
    name = "hexagon_enabled",
    define_values = {
        "hexagon": "true",
    },
    visibility = ["//visibility:public"],
)

package(
    default_visibility = ["//visibility:public"],
)

licenses(["notice"])  # Apache 2.0

# Headers of micro/base, exposed separately so dependents (and :base itself)
# can depend on the declarations without the implementation sources.
cc_library(
    name = "base_hdrs",
    hdrs = glob([
        "*.h",
    ]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        "//micro/include",
        "//micro/port",
    ],
)

# Implementation of micro/base (logger, serialization helpers, utils).
cc_library(
    name = "base",
    srcs = glob(
        [
            "*.cc",
        ],
    ),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        "base_hdrs",
        "//micro/port",
    ],
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/base/logger.h"
#include "micro/base/value_to_str.h"
#include "micro/port/api.h"
namespace micro {
namespace base {
namespace {

// Buffer sizes for number-to-string conversion: the widest value of the
// type (sign + digits) plus the trailing '\0'.
const int32_t kInt64ValueBufferLength = 21;  // "-9223372036854775808" + NUL
const int32_t kInt32ValueBufferLength = 12;  // "-2147483648" + NUL
// The 16- and 8-bit lengths were 6 and 4, one byte short of holding
// "-32768" / "-128" plus the terminator; bumped to avoid truncation of
// the extreme negative values.
const int32_t kInt16ValueBufferLength = 7;   // "-32768" + NUL
const int32_t kInt8ValueBufferLength = 5;    // "-128" + NUL
const int32_t kFloatValueBufferLength = 21;

inline bool IsValidLogLevel(const LogLevel level) {
  return level >= CLEAN && level < INVALID_MAX;
}

// Maps a level to its single-character tag ('C','I','W','E','F');
// out-of-range levels fall back to INFO.
char LogLevelToShortStr(LogLevel level) {
  if (!IsValidLogLevel(level)) {
    level = INFO;
  }
  return "CIWEF"[static_cast<int>(level)];
}

}  // namespace
// Writes the log-line prefix "<level> <file>:<line>] " with three DebugLog
// calls. CLEAN severity emits no prefix at all.
Logger::Logger(const char *fname, uint32_t line,
               LogLevel severity) : severity_(severity) {
  if (severity == CLEAN) {
    return;
  }
  // Scratch buffer reused twice: first "<level> ", then ":<line>] ".
  // NOTE(review): assumes ToString stays within [begin, end) and
  // NUL-terminates -- confirm in micro/base/value_to_str.h.
  char buffer[15] = {0};
  char *end = buffer + 15;
  buffer[0] = LogLevelToShortStr(severity);
  buffer[1] = ' ';
  micro::port::api::DebugLog(buffer);
  micro::port::api::DebugLog(fname);
  buffer[0] = ':';
  // Inner ToString writes the line number after ':'; the outer call
  // appends "] " at the position the inner call returned.
  ToString("] ", ToString(line, buffer + 1, end), end);
  micro::port::api::DebugLog(buffer);
}

// Terminates the log line; FATAL aborts the process after flushing.
Logger::~Logger() {
  micro::port::api::DebugLog("\n");
  if (severity_ == FATAL) {
    micro::port::api::Abort();
  }
}
// Appends a NUL-terminated string verbatim.
const Logger &Logger::operator<<(const char *str) const {
  micro::port::api::DebugLog(str);
  return *this;
}

// Appends a single character.
const Logger &Logger::operator<<(const char c) const {
  const char text[2] = {c, '\0'};
  micro::port::api::DebugLog(text);
  return *this;
}

// Each numeric overload formats the value into a stack buffer sized for
// the widest representation of its type, then emits it.
const Logger &Logger::operator<<(const float value) const {
  char text[kFloatValueBufferLength] = {'\0'};
  ToString(value, text, text + kFloatValueBufferLength);
  micro::port::api::DebugLog(text);
  return *this;
}

const Logger &Logger::operator<<(const int64_t value) const {
  char text[kInt64ValueBufferLength] = {'\0'};
  ToString(value, text, text + kInt64ValueBufferLength);
  micro::port::api::DebugLog(text);
  return *this;
}

const Logger &Logger::operator<<(const int32_t value) const {
  char text[kInt32ValueBufferLength] = {'\0'};
  ToString(value, text, text + kInt32ValueBufferLength);
  micro::port::api::DebugLog(text);
  return *this;
}

const Logger &Logger::operator<<(const uint32_t value) const {
  char text[kInt32ValueBufferLength] = {'\0'};
  ToString(value, text, text + kInt32ValueBufferLength);
  micro::port::api::DebugLog(text);
  return *this;
}

const Logger &Logger::operator<<(const int16_t value) const {
  char text[kInt16ValueBufferLength] = {'\0'};
  ToString(value, text, text + kInt16ValueBufferLength);
  micro::port::api::DebugLog(text);
  return *this;
}

const Logger &Logger::operator<<(const uint16_t value) const {
  char text[kInt16ValueBufferLength] = {'\0'};
  ToString(value, text, text + kInt16ValueBufferLength);
  micro::port::api::DebugLog(text);
  return *this;
}

const Logger &Logger::operator<<(const int8_t value) const {
  char text[kInt8ValueBufferLength] = {'\0'};
  ToString(value, text, text + kInt8ValueBufferLength);
  micro::port::api::DebugLog(text);
  return *this;
}

const Logger &Logger::operator<<(const uint8_t value) const {
  char text[kInt8ValueBufferLength] = {'\0'};
  ToString(value, text, text + kInt8ValueBufferLength);
  micro::port::api::DebugLog(text);
  return *this;
}

// Booleans are printed as the words "true" / "false".
const Logger &Logger::operator<<(const bool value) const {
  micro::port::api::DebugLog(value ? "true" : "false");
  return *this;
}
} // namespace base
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_BASE_LOGGER_H_
#define MICRO_BASE_LOGGER_H_

#include <stdint.h>

namespace micro {

// Log severities, lowest to highest. CLEAN emits the message with no
// "<level> <file>:<line>] " prefix; FATAL aborts the program once the
// log statement completes (see ~Logger in logger.cc).
enum LogLevel {
  CLEAN = 0,
  INFO = 1,
  WARNING = 2,
  ERROR = 3,
  FATAL = 4,
  INVALID_MAX,  // sentinel for range checks, not a real level
};

namespace base {

// Minimal streaming logger for micro controllers. Constructing an
// instance writes the prefix, operator<< appends values, and the
// destructor terminates the line (and aborts for FATAL).
class Logger {
 public:
  Logger(const char *fname, uint32_t line, LogLevel severity);
  ~Logger();

  // Each overload formats one value and appends it to the current line.
  const Logger &operator<<(const char *str) const;
  const Logger &operator<<(const char c) const;
  const Logger &operator<<(const float value) const;
  const Logger &operator<<(const int64_t value) const;
  const Logger &operator<<(const int32_t value) const;
  const Logger &operator<<(const uint32_t value) const;
  const Logger &operator<<(const int16_t value) const;
  const Logger &operator<<(const uint16_t value) const;
  const Logger &operator<<(const int8_t value) const;
  const Logger &operator<<(const uint8_t value) const;
  const Logger &operator<<(const bool value) const;

 private:
  LogLevel severity_;  // retained so the destructor can abort on FATAL
};

}  // namespace base
}  // namespace micro

#endif  // MICRO_BASE_LOGGER_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_BASE_LOGGING_H_
#define MICRO_BASE_LOGGING_H_

#include <stdint.h>

#include "micro/base/logger.h"
#include "micro/include/port/define.h"

namespace micro {
namespace log {

// Creates a temporary Logger tagged with the current file, line and level;
// values are streamed into it with operator<<.
#define LOG(severity) \
  micro::base::Logger(__FILE__, __LINE__, micro::severity)

// LOG1..LOG5 log a fixed number of values; they compile to nothing in
// release (NDEBUG) builds.
#ifndef NDEBUG
#define LOG1(severity, value) LOG(severity) << value
#define LOG2(severity, value1, value2) LOG(severity) << value1 << value2
#define LOG3(severity, value1, value2, value3) \
  LOG(severity) << value1 << value2 << value3
#define LOG4(severity, value1, value2, value3, value4) \
  LOG(severity) << value1 << value2 << value3 << value4
#define LOG5(severity, value1, value2, value3, value4, value5) \
  LOG(severity) << value1 << value2 << value3 << value4 << value5
#else
#define LOG1(severity, value)
#define LOG2(severity, value1, value2)
#define LOG3(severity, value1, value2, value3)
#define LOG4(severity, value1, value2, value3, value4)
#define LOG5(severity, value1, value2, value3, value4, value5)
#endif  // NDEBUG

// Debug-only assertions: abort via LOG(FATAL) when the condition fails,
// compile to nothing under NDEBUG (the condition is NOT evaluated then).
#ifndef NDEBUG
#define MACE_ASSERT(condition) \
  if (!(condition)) LOG(FATAL) << "Assert failed: "#condition  // NOLINT
#define MACE_ASSERT1(condition, str) \
  if (!(condition)) LOG(FATAL) << "Assert failed: "#condition " " << str  // NOLINT
#define MACE_ASSERT2(condition, str1, str2) \
  if (!(condition)) LOG(FATAL) << "Assert failed: "#condition " " << str1 << str2  // NOLINT
#else
#define MACE_ASSERT(condition)
#define MACE_ASSERT1(condition, string)
#define MACE_ASSERT2(condition, string1, string2)
#endif  // NDEBUG

#define MACE_NOT_IMPLEMENTED MACE_ASSERT1(false, "not implemented")

// Evaluates |stmt| exactly once and aborts when it is not MACE_SUCCESS.
// The local is named mace_status (not "status") to reduce the risk of
// shadowing a caller-side variable, since macros share the caller's scope.
#define MACE_CHECK_SUCCESS(stmt) \
  { \
    MaceStatus mace_status = (stmt); \
    if (mace_status != MACE_SUCCESS) { \
      LOG(FATAL) << #stmt << " failed with error: " \
                 << mace_status; \
    } \
  }

// Evaluates |stmt| exactly once; on failure logs it and returns the status
// to the caller. (The previous version re-evaluated |stmt| inside the log
// line, executing the statement a second time on the failure path.)
#define MACE_RETURN_IF_ERROR(stmt) \
  { \
    MaceStatus mace_status = (stmt); \
    if (mace_status != MACE_SUCCESS) { \
      LOG(INFO) << #stmt \
                << " failed with error: " \
                << static_cast<int32_t>(mace_status); \
      return mace_status; \
    } \
  }

}  // namespace log
}  // namespace micro

#endif  // MICRO_BASE_LOGGING_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/base/serialize.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
namespace micro {

#ifdef MACE_WRITE_MAGIC

// Returns the magic value stored in the serialized object.
SerialUint32 Serialize::GetMagic() const {
  return magic_;
}

// Packs the first 4 characters of |bytes4| into a uint32, lowest byte
// first (byte k goes to bits [8k, 8k+8)).
SerialUint32 Serialize::Magic(const char *bytes4) const {
  MACE_ASSERT1(micro::base::strlen(bytes4) >= 4, "The magic bytes must >= 4.");
  SerialUint32 magic = 0;
  for (int32_t i = 0; i < 32 && (*bytes4) != '\0'; i += 8, ++bytes4) {
    // Cast through uint8_t so characters >= 0x80 do not sign-extend
    // before the shift (char may be signed); OR the clean byte lanes.
    magic |= static_cast<SerialUint32>(static_cast<uint8_t>(*bytes4)) << i;
  }
  return magic;
}

// Inverse of Magic(): unpacks the 4 bytes into |array| and NUL-terminates.
MaceStatus Serialize::MagicToString(SerialUint32 magic,
                                    char (&array)[5]) const {
  char *buffer = array;
  for (int32_t i = 0; i < 32; i += 8, ++buffer) {
    *buffer = (magic >> i) & 0x000000ff;
  }
  *buffer = '\0';
  return MACE_SUCCESS;
}
#endif  // MACE_WRITE_MAGIC

// Decodes an OpIOInfo stored on disk as one packed uint32 in place:
// high 16 bits -> op_def_idx_, low 16 bits -> output_idx_.
// NOTE(review): reads the struct through a uint32_t* (aliasing) and
// assumes little-endian field layout -- confirm for new target ports.
void Serialize::Uint2OpIOInfo(const OpIOInfo *info) const {
  OpIOInfo *io_info = const_cast<OpIOInfo *>(info);
  uint32_t info_data = *(reinterpret_cast<uint32_t *>(io_info));
  io_info->op_def_idx_ = (info_data & 0xffff0000) >> 16;
  io_info->output_idx_ = (info_data & 0x0000ffff);
}

}  // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_BASE_SERIALIZE_H_
#define MICRO_BASE_SERIALIZE_H_

#include <stdint.h>

#include "micro/base/serialize_type.h"
#include "micro/include/public/micro.h"

namespace micro {

// When MACE_WRITE_MAGIC is defined, serializable classes carry a 4-byte
// magic derived from the class name; MACE_DEFINE_HARD_CODE_MAGIC adds a
// method that recomputes it so it can be checked against the stored magic_.
// Without the flag the macro expands to nothing.
#ifdef MACE_WRITE_MAGIC
#ifndef MACE_DEFINE_HARD_CODE_MAGIC
#define MACE_DEFINE_HARD_CODE_MAGIC(CLASS_NAME) \
  SerialUint32 GetHardCodeMagic() const { \
    return Magic(#CLASS_NAME); \
  }
#endif  // MACE_DEFINE_HARD_CODE_MAGIC
#else
#ifndef MACE_DEFINE_HARD_CODE_MAGIC
#define MACE_DEFINE_HARD_CODE_MAGIC(CLASS_NAME)
#endif  // MACE_DEFINE_HARD_CODE_MAGIC
#endif  // MACE_WRITE_MAGIC

// We describe a tensor as an output tensor, but it can also
// be used to represent an input tensor.
struct OpIOInfo {
  uint16_t op_def_idx_;  // high 16 bits of the packed on-disk uint32
  uint16_t output_idx_;  // low 16 bits (see Serialize::Uint2OpIOInfo)
};

// Base class for objects loaded from the serialized model buffer.
class Serialize {
#ifdef MACE_WRITE_MAGIC
 public:
  SerialUint32 GetMagic() const;
  // Unpacks |magic| into 4 characters plus a NUL terminator.
  MaceStatus MagicToString(SerialUint32 magic, char (&array)[5]) const;

 protected:
  SerialUint32 magic_;

 protected:
  // Packs the first 4 characters of |bytes4| into a uint32.
  SerialUint32 Magic(const char *bytes4) const;
#endif  // MACE_WRITE_MAGIC

 public:
  // Decodes an OpIOInfo stored as a packed uint32 in place.
  void Uint2OpIOInfo(const OpIOInfo *output_info) const;
};

}  // namespace micro

#endif  // MICRO_BASE_SERIALIZE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_BASE_SERIALIZE_TYPE_H_
#define MICRO_BASE_SERIALIZE_TYPE_H_
#include <stdint.h>
#include "micro/include/public/micro.h"
namespace micro {
// Byte-offset type used inside the serialized buffer; 16-bit offsets can
// be selected to shrink small models.
#ifdef MACE_OFFSET_USE_16
typedef uint16_t offset_size_t;
#else
typedef uint32_t offset_size_t;
#endif  // MACE_OFFSET_USE_16

// An array embedded in the serialized buffer: |size_| elements located
// |offset_| bytes from the start of the enclosing object (see
// MACE_DEFINE_ARRAY_FUNC, which resolves it relative to `this`).
template<typename T>
struct SerialArray {
  offset_size_t size_;
  offset_size_t offset_;

  SerialArray() : size_(0), offset_(0) {}
};

// A string embedded in the buffer; the characters live |offset_| bytes
// from the start of the enclosing object.
struct SerialString {
  offset_size_t packed_length_;
  offset_size_t offset_;

  SerialString() : packed_length_(0), offset_(0) {}
};

// Raw bytes embedded in the buffer, addressed like SerialString.
struct SerialBytes {
  offset_size_t packed_length_;
  offset_size_t offset_;

  SerialBytes() : packed_length_(0), offset_(0) {}
};

// Scalar value types as stored in the buffer. NOTE(review): the 16-bit,
// 8-bit and bool variants are widened to 32 bits -- presumably to keep
// the serialized layout uniformly 4-byte aligned; confirm against the
// serialization tool before relying on their width.
typedef float SerialFloat;
typedef int32_t SerialInt32;
typedef uint32_t SerialUint32;
typedef uint32_t SerialBool;
typedef int32_t SerialInt16;
typedef uint32_t SerialUint16;
typedef int32_t SerialInt8;
typedef uint32_t SerialUint8;
#ifndef MACE_DECLARE_OBJECT_FUNC
#define MACE_DECLARE_OBJECT_FUNC(T, OBJECT_NAME) \
T OBJECT_NAME() const;
#endif // MACE_DECLARE_OBJECT_FUNC
#ifndef MACE_DEFINE_OBJECT_FUNC
#define MACE_DEFINE_OBJECT_FUNC(CLASS_NAME, T, OBJECT_NAME) \
T CLASS_NAME::OBJECT_NAME() const { \
return OBJECT_NAME##_; \
}
#endif // MACE_DEFINE_OBJECT_FUNC
// Declares a const-pointer accessor. The include guard was previously
// misspelled "MACE_MACE_DECLARE_PTR_FUNC", so the #ifndef could never
// match the macro it guards and the redefinition guard was ineffective.
#ifndef MACE_DECLARE_PTR_FUNC
#define MACE_DECLARE_PTR_FUNC(T, OBJECT_NAME) \
  const T *OBJECT_NAME() const;
#endif  // MACE_DECLARE_PTR_FUNC
#ifndef MACE_DEFINE_PTR_FUNC
#define MACE_DEFINE_PTR_FUNC(CLASS_NAME, T, OBJECT_NAME) \
const T *CLASS_NAME::OBJECT_NAME() const { \
return &OBJECT_NAME##_; \
}
#endif // MACE_DEFINE_PTR_FUNC
#ifndef MACE_DECLARE_ARRAY_FUNC
#define MACE_DECLARE_ARRAY_FUNC(T, OBJECT_NAME) \
T OBJECT_NAME(uint32_t index) const; \
uint32_t OBJECT_NAME##_size() const
#endif // MACE_DECLARE_ARRAY_FUNC
#ifndef MACE_DECLARE_ARRAY_BASE_PTR_FUNC
#define MACE_DECLARE_ARRAY_BASE_PTR_FUNC(T, OBJECT_NAME) \
const T * OBJECT_NAME() const
#endif // MACE_DECLARE_ARRAY_BASE_PTR_FUNC
#ifndef MACE_DEFINE_ARRAY_BASE_PTR_FUNC
#define MACE_DEFINE_ARRAY_BASE_PTR_FUNC( \
CLASS_NAME, T, OBJECT_NAME, ARRAY_NAME) \
const T *CLASS_NAME::OBJECT_NAME() const { \
const T *array = reinterpret_cast<const T *>( \
reinterpret_cast<const uint8_t *>(this) + ARRAY_NAME.offset_); \
return array; \
}
#endif // MACE_DEFINE_ARRAY_BASE_PTR_FUNC
#ifndef MACE_DEFINE_ARRAY_FUNC
#define MACE_DEFINE_ARRAY_FUNC(CLASS_NAME, T, OBJECT_NAME, ARRAY_NAME) \
T CLASS_NAME::OBJECT_NAME(uint32_t index) const { \
const T *array = reinterpret_cast<const T *>( \
reinterpret_cast<const uint8_t *>(this) + ARRAY_NAME.offset_); \
return *(array + index); \
} \
uint32_t CLASS_NAME::OBJECT_NAME##_size() const { \
return ARRAY_NAME.size_; \
}
#endif // MACE_DEFINE_ARRAY_FUNC
#ifndef MACE_DECLARE_PTR_ARRAY_FUNC
#define MACE_DECLARE_PTR_ARRAY_FUNC(T, OBJECT_NAME) \
const T *OBJECT_NAME(uint32_t index) const; \
uint32_t OBJECT_NAME##_size() const
#endif // MACE_DECLARE_PTR_ARRAY_FUNC
#ifndef MACE_DEFINE_PTR_ARRAY_FUNC
#define MACE_DEFINE_PTR_ARRAY_FUNC(CLASS_NAME, T, OBJECT_NAME, ARRAY_NAME) \
const T *CLASS_NAME::OBJECT_NAME(uint32_t index) const { \
const T *array = reinterpret_cast<const T *>( \
reinterpret_cast<const uint8_t *>(this) + ARRAY_NAME.offset_); \
return (array + index); \
} \
\
uint32_t CLASS_NAME::OBJECT_NAME##_size() const { \
return ARRAY_NAME.size_; \
}
#endif // MACE_DEFINE_PTR_ARRAY_FUNC
#ifndef MACE_DECLARE_STRING_FUNC
#define MACE_DECLARE_STRING_FUNC(OBJECT_NAME) \
const char *OBJECT_NAME() const;
#endif // MACE_DECLARE_STRING_FUNC
#ifndef MACE_DEFINE_STRING_FUNC
#define MACE_DEFINE_STRING_FUNC(CLASS_NAME, OBJECT_NAME, STRING_NAME) \
const char *CLASS_NAME::OBJECT_NAME() const { \
if (STRING_NAME.packed_length_ == 0) { \
return NULL; \
} else { \
return reinterpret_cast<const char *>(this) + STRING_NAME.offset_; \
} \
}
#endif // MACE_DEFINE_STRING_FUNC
#ifndef MACE_DECLARE_BYTES_FUNC
#define MACE_DECLARE_BYTES_FUNC(OBJECT_NAME) \
const uint8_t *OBJECT_NAME() const; \
uint32_t OBJECT_NAME##_size() const
#endif // MACE_DECLARE_BYTES_FUNC
#ifndef MACE_DEFINE_BYTES_FUNC
#define MACE_DEFINE_BYTES_FUNC(CLASS_NAME, OBJECT_NAME, BYTES_NAME) \
const uint8_t *CLASS_NAME::OBJECT_NAME() const { \
if (BYTES_NAME.packed_length_ == 0) { \
return NULL; \
} else { \
return reinterpret_cast<const uint8_t *>(this) + BYTES_NAME.offset_; \
} \
} \
\
uint32_t CLASS_NAME::OBJECT_NAME##_size() const { \
return BYTES_NAME.packed_length_; \
}
#endif // MACE_DEFINE_BYTES_FUNC
#ifndef MACE_DECLARE_STRING_ARRAY_FUNC
#define MACE_DECLARE_STRING_ARRAY_FUNC(OBJECT_NAME) \
const char *OBJECT_NAME(uint32_t index) const; \
uint32_t OBJECT_NAME##_size() const
#endif
#ifndef MACE_DEFINE_STRING_ARRAY_FUNC
#define MACE_DEFINE_STRING_ARRAY_FUNC(CLASS_NAME, OBJECT_NAME, ARRAY_NAME) \
const char *CLASS_NAME::OBJECT_NAME(uint32_t index) const { \
const SerialString *array = reinterpret_cast<const SerialString *>( \
reinterpret_cast<const char *>(this) + ARRAY_NAME.offset_); \
const SerialString *serial_str = array + index; \
const char *str = reinterpret_cast<const char *>(serial_str) + \
serial_str->offset_; \
return str; \
} \
\
uint32_t CLASS_NAME::OBJECT_NAME##_size() const { \
return ARRAY_NAME.size_; \
}
#endif // MACE_DEFINE_STRING_ARRAY_FUNC
} // namespace micro
#endif // MICRO_BASE_SERIALIZE_TYPE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_BASE_TYPES_H_
#define MICRO_BASE_TYPES_H_

#include "micro/include/public/micro.h"
#include "micro/include/utils/bfloat16.h"

namespace micro {

// mifloat is the float type used by the micro runtime: BFloat16 when
// MACE_ENABLE_BFLOAT16 is defined, plain float otherwise.
#ifdef MACE_ENABLE_BFLOAT16
typedef BFloat16 mifloat;
#else
typedef float mifloat;
#endif  // MACE_ENABLE_BFLOAT16

// Compile-time mappings between C++ types and the DataType enum, in both
// directions: DataTypeToEnum<float>::value == DT_FLOAT and
// EnumToDataType<DT_FLOAT>::Type == float.
template<class T>
struct DataTypeToEnum;

template<DataType VALUE>
struct EnumToDataType;

// Instantiates both specializations for one (type, enum) pair.
#ifndef MACE_MAPPING_DATA_TYPE_AND_ENUM
#define MACE_MAPPING_DATA_TYPE_AND_ENUM(DATA_TYPE, ENUM_VALUE) \
  template <> \
  struct DataTypeToEnum<DATA_TYPE> { \
    static DataType v() { return ENUM_VALUE; } \
    static const DataType value = ENUM_VALUE; \
  }; \
  template <> \
  struct EnumToDataType<ENUM_VALUE> { \
    typedef DATA_TYPE Type; \
  };
#endif  // MACE_MAPPING_DATA_TYPE_AND_ENUM

MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT);
MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8);
MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32);
#ifdef MACE_ENABLE_BFLOAT16
MACE_MAPPING_DATA_TYPE_AND_ENUM(BFloat16, DT_BFLOAT16);
#endif

}  // namespace micro

#endif  // MICRO_BASE_TYPES_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/base/utils.h"
#include <math.h>
#include "micro/base/logging.h"
namespace micro {
namespace base {
// Length of a NUL-terminated string, excluding the terminator.
uint32_t strlen(const char *str) {
  MACE_ASSERT1(str != NULL, "str can not be NULL.");
  const char *cursor = str;
  while (*cursor != '\0') {
    ++cursor;
  }
  return static_cast<uint32_t>(cursor - str);
}
// Lexicographic comparison of two NUL-terminated strings; returns 0 when
// equal, otherwise the (char-signedness-dependent) difference of the first
// mismatching characters.
int32_t strcmp(const char *str1, const char *str2) {
  MACE_ASSERT1(str1 != NULL && str2 != NULL,
               "strcmp str can not be NULL.");
  for (; *str1 == *str2; ++str1, ++str2) {
    if (*str1 == '\0') {
      return 0;
    }
  }
  return (*str1) - (*str2);
}
// Byte-wise copy of `bytes` bytes from src to dst (regions must not overlap).
// Fix: a zero-byte copy is a valid no-op, so the debug assert no longer
// requires bytes > 0 (the loop already handles bytes == 0 naturally).
void memcpy(void *dst, const void *src, uint32_t bytes) {
  MACE_ASSERT1(dst != NULL && src != NULL,
               "Invalid params.");
  uint8_t *dst_mem = static_cast<uint8_t *>(dst);
  const uint8_t *src_mem = static_cast<const uint8_t *>(src);
  while (bytes-- > 0) {
    *dst_mem++ = *src_mem++;
  }
}
// Number of elements in a shape: the product of all dims.
// Returns 1 (the multiplicative identity) for a NULL or empty shape,
// matching accumulate_multi(dims, 0, dim_size), here inlined.
int32_t GetShapeSize(uint32_t dim_size, const int32_t *dims) {
  if (dims == NULL || dim_size == 0) {
    return 1;
  }
  int32_t size = dims[0];
  for (uint32_t i = 1; i < dim_size; ++i) {
    size *= dims[i];
  }
  return size;
}
// Thin wrapper delegating to the C library's ::sqrt (double precision,
// implicitly converted back to float).
float sqrt(float x) {
  return ::sqrt(x);
}
// Smallest integer not less than f.
// Fix: the cast truncates toward zero, which for negative non-integral f
// already yields the ceiling; the old code unconditionally added 1 for any
// non-integral value, so ceil(-1.5f) wrongly returned 0 instead of -1.
int32_t ceil(float f) {
  int32_t i = (int32_t) f;
  return (f > static_cast<float>(i)) ? i + 1 : i;
}
// Largest integer not greater than f; delegates to ::floor and truncates
// the (already integral) double result into an int32_t.
int32_t floor(float f) {
  return ::floor(f);
}
// Absolute value of x; zero (of either sign) maps to +0.0f.
float fabs(float x) {
  if (x > 0.0f) {
    return x;
  }
  if (x < 0.0f) {
    return -x;
  }
  return 0.0f;
}
// Most negative finite float (== -FLT_MAX).
float lowest() {
  return -3.402823466e+38F;
}
// Largest finite float (== FLT_MAX).
float highest() {
  return 3.402823466e+38F;
}
// The following are thin wrappers over the C library's double-precision
// functions, with implicit double -> float narrowing on return.
float tanh(float x) {
  return ::tanh(x);
}
float exp(float x) {
  return ::exp(x);
}
float pow(float x, float y) {
  return ::pow(x, y);
}
float log(float x) {
  return ::log(x);
}
} // namespace base
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_BASE_UTILS_H_
#define MICRO_BASE_UTILS_H_
#include <stdint.h>
#include "micro/base/logging.h"
namespace micro {
namespace base {
uint32_t strlen(const char *str);
int32_t strcmp(const char *str1, const char *str2);
void memcpy(void *dst, const void *src, uint32_t bytes);
int32_t GetShapeSize(uint32_t dim_size, const int32_t *dims);
float sqrt(float x);
int32_t ceil(float f);
int32_t floor(float f);
float fabs(float x);
float lowest();
float highest();
float tanh(float x);
float exp(float x);
float pow(float x, float y);
float log(float x);
// Fills `size` elements starting at src with `value` (element-wise, not
// byte-wise like the C library's memset).
template<typename T>
void memset(T *src, T value, uint32_t size) {
  T *const src_end = src + size;
  for (T *cursor = src; cursor < src_end; ++cursor) {
    *cursor = value;
  }
}
// Product of array[array_start, array_end); returns 1 (the multiplicative
// identity) for a NULL array or an empty range.
// Fix: dropped `array_start >= 0` from the assert -- array_start is
// unsigned, so that comparison was always true and triggers -Wtype-limits
// under the project's -Wextra -Werror flags.
template<typename T>
T accumulate_multi(const T *array, uint32_t array_start, uint32_t array_end) {
  MACE_ASSERT(array_start <= array_end);
  if (array == NULL || array_start == array_end) {
    return 1;
  }
  T total = array[array_start];
  for (uint32_t i = array_start + 1; i < array_end; ++i) {
    total *= array[i];
  }
  return total;
}
// Absolute value; for x == 0 returns -x, exactly as the original ternary.
template<typename T>
T abs(T x) {
  if (x > 0) {
    return x;
  }
  return -x;
}
// Larger of a and b; returns b when they compare equal.
template<typename T>
T max(T a, T b) {
  if (a > b) {
    return a;
  }
  return b;
}
// Smaller of a and b; returns b when they compare equal.
template<typename T>
T min(T a, T b) {
  if (a < b) {
    return a;
  }
  return b;
}
// Exchanges the values pointed to by a and b.
template<typename T>
void swap(T *a, T *b) {  // NOLINT
  const T saved = *a;
  *a = *b;
  *b = saved;
}
// Clamps `in` into [low, high]; computed as max(low, min(in, high))
// inlined, so when low > high the result is low, like the original.
template<typename T>
T clamp(T in, T low, T high) {
  const T upper_bounded = (in < high) ? in : high;
  return (low > upper_bounded) ? low : upper_bounded;
}
} // namespace base
} // namespace micro
#endif // MICRO_BASE_UTILS_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/base/value_to_str.h"
namespace micro {
namespace base {
// Defines a ToString specialization for a signed integer type: emits a
// leading '-' for negative values, then forwards the magnitude to the
// unsigned-type overload. NOTE(review): `value = -value` overflows for the
// type's minimum value (e.g. INT32_MIN) -- confirm callers never pass it.
#ifndef MACE_SIGNED_TO_STRING
#define MACE_SIGNED_TO_STRING(T, UNSIGNED_T) \
  template<> \
  char *ToString(T value, char *buffer, char *end) { \
    if (value < 0) { \
      value = -value; \
      *buffer++ = '-'; \
    } \
    return ToString(static_cast<UNSIGNED_T>(value), buffer, end); \
  }
#endif  // MACE_SIGNED_TO_STRING
// Reverses the characters in [start, end) in place; `end` is exclusive.
void ReverseInplace(char *start, char *end) {
  for (char *left = start, *right = end - 1; left < right; ++left, --right) {
    const char saved = *left;
    *left = *right;
    *right = saved;
  }
}
MACE_SIGNED_TO_STRING(int64_t, uint64_t)
MACE_SIGNED_TO_STRING(int32_t, uint32_t)
MACE_SIGNED_TO_STRING(int16_t, uint16_t)
MACE_SIGNED_TO_STRING(int8_t, uint8_t)
// Copies `str` into [buffer, end), reserving one byte for the trailing
// '\0'; truncates silently if the buffer is too small. Returns a pointer
// to the written terminator.
template<>
char *ToString(const char *str, char *buffer, char *end) {
  char *const last = end - 1;
  for (; *str != '\0' && buffer < last; ++str, ++buffer) {
    *buffer = *str;
  }
  *buffer = '\0';
  return buffer;
}
// Formats a float as decimal text into [buffer, end) and returns a pointer
// past the last written character. Values within +/-1e-8 of an integer get
// no fractional part.
// Fix: for value <= -1, the old code wrote '-' here AND passed the negative
// int_part to the signed-int ToString, which wrote a second '-' ("--1.5").
// Negating `value` right after emitting the sign keeps both the integer and
// fractional parts non-negative, so the sign is written exactly once.
template<>
char *ToString(float value, char *buffer, char *end) {
  if (value <= -1e-8) {
    *buffer++ = '-';
    value = -value;
  }
  int32_t int_part = (int32_t) value;
  buffer = ToString(int_part, buffer, end);
  float deci_part = value - int_part;
  if (deci_part < 1e-8 && deci_part > -1e-8) {
    return buffer;
  }
  end--;
  *buffer++ = '.';
  do {
    deci_part *= 10;
    int32_t remainder = (int32_t) deci_part;
    *buffer++ = '0' + remainder;
    deci_part -= remainder;
  } while (deci_part > 0 && buffer < end);
  *buffer = '\0';
  return buffer;
}
} // namespace base
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_BASE_VALUE_TO_STR_H_
#define MICRO_BASE_VALUE_TO_STR_H_
#include <stdint.h>
namespace micro {
namespace base {
void ReverseInplace(char *start, char *end);
// for uint64_t/uint32_t/uint16_t/uint8_t
// Unsigned-integer to decimal conversion into [buffer, end); `end` is
// exclusive and one byte is reserved for the trailing '\0'. Digits are
// emitted least-significant first and then reversed in place. If the
// buffer is too small the value is silently truncated (only the low-order
// digits are kept). Returns a pointer to the written terminator.
template<typename T>
char *ToString(T value, char *buffer, char *end) {
  char *start = buffer;
  end--;
  do {
    *buffer++ = '0' + (value % 10);
    value /= 10;
  } while (value > 0 && buffer < end);
  ReverseInplace(start, buffer);
  *buffer = '\0';
  return buffer;
}
template<>
char *ToString(int64_t value, char *buffer, char *end);
template<>
char *ToString(int32_t value, char *buffer, char *end);
template<>
char *ToString(int16_t value, char *buffer, char *end);
template<>
char *ToString(int8_t value, char *buffer, char *end);
template<>
char *ToString(const char *str, char *buffer, char *end);
template<>
char *ToString(float value, char *buffer, char *end);
} // namespace base
} // namespace micro
#endif // MICRO_BASE_VALUE_TO_STR_H_
# Description:
# Generated model and runtime code.
#
package(
default_visibility = ["//visibility:public"],
)
# Bundles all code-generated model sources (micro/codegen/models/**) into a
# single library consumed by :micro_engine.
cc_library(
    name = "generated_models",
    srcs = glob(["models/**/*.cc"]),
    hdrs = glob(["models/**/*.h"]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        "//micro/framework",
        "//micro/include",
        "//micro/model",
        "//micro/ops",
    ],
)
# C-interface wrapper around :micro_engine.
# Fix: glob() patterns are relative to this package (micro/codegen), so the
# old "micro/codegen/engines/**" patterns could never match anything and the
# target was empty.
cc_library(
    name = "micro_engine_c",
    srcs = glob(["engines/**/micro_engine_c_interface.cc"]),
    # NOTE(review): listing a .cc file in hdrs looks wrong -- confirm whether
    # a generated header should be exported here instead.
    hdrs = glob(["engines/**/micro_engine_c_interface.cc"]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        ":micro_engine",
    ],
    alwayslink = 1,
)
# Generated engine sources, excluding the C interface (which lives in
# :micro_engine_c).
# Fix: glob() patterns are package-relative, so the old exclude pattern
# "micro/codegen/engines/**/..." never matched -- micro_engine_c_interface.cc
# was NOT excluded and got compiled into this target as well. The hdrs
# exclude (a .cc pattern against a *.h glob) matched nothing and is dropped.
cc_library(
    name = "micro_engine",
    srcs = glob(
        ["engines/**/*.cc"],
        exclude = ["engines/**/micro_engine_c_interface.cc"],
    ),
    hdrs = glob(
        [
            "engines/**/*.h",
        ],
    ),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        ":generated_models",
        "//micro/framework",
        "//micro/model",
        "//micro/ops",
    ],
    alwayslink = 1,
)
# Shared-library packaging of the engine (.so).
cc_binary(
    name = "libmicro.so",
    linkshared = 1,
    linkstatic = 1,
    deps = [
        ":micro_engine",
    ],
)
# Fully-static packaging of the engine.
# NOTE(review): uses True/False while libmicro.so uses 1 -- equivalent in
# Starlark, but pick one form for consistency.
cc_binary(
    name = "libmicro.lo",
    linkshared = False,
    linkstatic = True,
    deps = [
        ":micro_engine",
    ],
)
package(
default_visibility = ["//visibility:public"],
)
licenses(["notice"]) # Apache 2.0
# The micro framework runtime (graph, op context, operator base, scratch
# buffer).
cc_library(
    name = "framework",
    srcs = glob(["*.cc"]),
    hdrs = glob(["*.h"]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        "//micro/base",
        "//micro/include",
        "//micro/model",
    ],
)
# Same library minus operator.cc, for op tests that supply their own
# operator implementation.
cc_library(
    name = "framework_for_optest",
    srcs = glob(
        ["*.cc"],
        exclude = ["operator.cc"],
    ),
    hdrs = glob(["*.h"]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        "//micro/base",
        "//micro/include",
        "//micro/model",
    ],
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/framework/graph.h"
#include "micro/base/logging.h"
#include "micro/base/serialize.h"
#include "micro/base/utils.h"
#include "micro/framework/operator.h"
#include "micro/include/public/micro.h"
#include "micro/model/net_def.h"
namespace micro {
namespace framework {
MACE_DEFINE_PTR_ARRAY_FUNC(Graph, OpContext, op_context, op_contexts_)
MACE_DEFINE_ARRAY_FUNC(Graph, uint32_t, input_op_idx, input_op_idxs_);
MACE_DEFINE_PTR_ARRAY_FUNC(Graph, OpIOInfo, output_info, output_infos_);
// Prepares the graph for execution: converts the packed output descriptors
// via Uint2OpIOInfo, then initializes every OpContext against its matching
// OperatorDef from the net definition.
MaceStatus Graph::Init(MaceMicroEngineConfig *engine_config) {
  MACE_ASSERT(engine_config->net_def_->op_size() == op_context_size());
  const uint32_t output_count = output_info_size();
  for (uint32_t idx = 0; idx < output_count; ++idx) {
    Uint2OpIOInfo(output_info(idx));
  }
  const uint32_t op_count = engine_config->net_def_->op_size();
  for (uint32_t idx = 0; idx < op_count; ++idx) {
    OpContext *ctx = const_cast<OpContext *>(op_context(idx));
    MACE_RETURN_IF_ERROR(ctx->Init(engine_config,
                                   engine_config->net_def_->op(idx)));
  }
  return MACE_SUCCESS;
}
// Records one model input (buffer pointer + dims) in the engine config,
// then re-runs OnInit() on the operator that consumes that input so it can
// pick up the new buffer/shape.
MaceStatus Graph::RegisterInputData(MaceMicroEngineConfig *engine_config,
                                    uint32_t idx,
                                    const void *input_buffer,
                                    const int32_t *input_dims) {
  engine_config->input_buffers_[idx] = input_buffer;
  engine_config->input_shapes_[idx] = input_dims;
  // update the op's input buffers
  uint32_t op_idx = input_op_idx(idx);
  framework::Operator *input_op = engine_config->op_array_[op_idx];
  return input_op->OnInit();
}
// Executes every operator context once, in net-definition order, stopping
// at the first error.
MaceStatus Graph::Run(MaceMicroEngineConfig *engine_config) {
  uint32_t op_size = engine_config->net_def_->op_size();
  for (uint32_t i = 0; i < op_size; ++i) {
    OpContext *op_ctx = const_cast<OpContext *>(op_context(i));
    MACE_RETURN_IF_ERROR(op_ctx->Run(engine_config));
  }
  return MACE_SUCCESS;
}
// Resolves graph output `idx` to its producing (op, output) pair and
// forwards to GetOpOutputData.
MaceStatus Graph::GetOutputData(MaceMicroEngineConfig *engine_config,
                                const uint32_t idx,
                                void **output_data,
                                const int32_t **output_dims,
                                uint32_t *output_dim_size) {
  MACE_ASSERT(idx < output_info_size());
  const OpIOInfo *o_info = output_info(idx);
  return GetOpOutputData(engine_config, o_info->op_def_idx_,
                         o_info->output_idx_, output_data,
                         output_dims, output_dim_size);
}
// Returns a pointer into the shared tensor arena (tensor_mem_ plus the
// op's recorded mem_offset) for one op output, together with its resized
// shape from the op's context.
MaceStatus Graph::GetOpOutputData(MaceMicroEngineConfig *engine_config,
                                  const uint32_t op_def_idx,
                                  const uint32_t output_idx,
                                  void **output_data,
                                  const int32_t **output_dims,
                                  uint32_t *output_dim_size) {
  MACE_ASSERT(engine_config != NULL);
  MACE_ASSERT(output_data != NULL);
  MACE_ASSERT(output_dims != NULL);
  MACE_ASSERT(output_dim_size != NULL);
  const model::OperatorDef *op_def = engine_config->net_def_->op(op_def_idx);
  *output_data = engine_config->tensor_mem_ + op_def->mem_offset(output_idx);
  const model::OutputShape *output_shape =
      op_context(op_def_idx)->output_resize_shape(output_idx);
  *output_dims = output_shape->dim();
  *output_dim_size = output_shape->dim_size();
  return MACE_SUCCESS;
}
} // namespace framework
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_FRAMEWORK_GRAPH_H_
#define MICRO_FRAMEWORK_GRAPH_H_
#include "micro/base/serialize.h"
#include "micro/framework/op_context.h"
namespace micro {
struct MaceMicroEngineConfig;
namespace framework {
// Serialized execution graph: an array of per-op contexts, the indices of
// the ops that consume the model inputs, and descriptors of the graph
// outputs. Deserialized in place from the generated model data.
class Graph : public Serialize {
 public:
  MACE_DEFINE_HARD_CODE_MAGIC(Graph)

  // Serialized-array accessors: op_context(i), input_op_idx(i),
  // output_info(i) plus their *_size() counterparts.
  MACE_DECLARE_PTR_ARRAY_FUNC(OpContext, op_context);
  MACE_DECLARE_ARRAY_FUNC(uint32_t, input_op_idx);
  MACE_DECLARE_PTR_ARRAY_FUNC(OpIOInfo, output_info);

  // Decodes output descriptors and initializes every op context.
  MaceStatus Init(MaceMicroEngineConfig *engine_config);
  // Binds an input buffer/shape and re-initializes its consuming op.
  MaceStatus RegisterInputData(MaceMicroEngineConfig *engine_config,
                               uint32_t idx,
                               const void *input_buffer,
                               const int32_t *input_dims);
  // Runs all ops in definition order.
  MaceStatus Run(MaceMicroEngineConfig *engine_config);
  // Fetches a graph output's data pointer and resized shape.
  MaceStatus GetOutputData(MaceMicroEngineConfig *engine_config,
                           const uint32_t idx,
                           void **output_data,
                           const int32_t **output_dims,
                           uint32_t *output_dim_size);
  // Fetches an arbitrary op output's data pointer and resized shape.
  MaceStatus GetOpOutputData(MaceMicroEngineConfig *engine_config,
                             const uint32_t op_def_idx,
                             const uint32_t output_idx,
                             void **output_data,
                             const int32_t **output_dims,
                             uint32_t *output_dim_size);

 protected:
  SerialArray<OpContext> op_contexts_;
  SerialArray<SerialUint32> input_op_idxs_;
  SerialArray<OpIOInfo> output_infos_;
};
} // namespace framework
} // namespace micro
#endif // MICRO_FRAMEWORK_GRAPH_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/graph.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/include/public/micro.h"
#include "micro/model/net_def.h"
#include "micro/model/operator_def.h"
#include "micro/port/api.h"
namespace micro {
// Validates that the generated engine config is fully populated, stores it,
// and initializes the graph (which in turn initializes every operator).
MaceStatus MaceMicroEngine::Init(MaceMicroEngineConfig *engine_config) {
  MACE_ASSERT(engine_config != NULL && engine_config->net_def_ != NULL
                  && engine_config->model_data_ != NULL
                  && engine_config->graph_ != NULL
                  && engine_config->op_array_ != NULL
                  && engine_config->tensor_mem_ != NULL);
  engine_config_ = engine_config;
  MACE_RETURN_IF_ERROR(engine_config_->graph_->Init(engine_config_));
  return MACE_SUCCESS;
}
// Binds the caller-owned input buffer and dims for model input `idx`;
// the buffer must stay alive until after Run().
MaceStatus MaceMicroEngine::RegisterInputData(uint32_t idx,
                                              const void *input_buffer,
                                              const int32_t *input_dims) {
  MACE_ASSERT(idx < engine_config_->net_def_->input_info_size());
  MACE_ASSERT(input_buffer != NULL);
  MACE_ASSERT(input_dims != NULL);
  return engine_config_->graph_->RegisterInputData(engine_config_, idx,
                                                   input_buffer, input_dims);
}
// Executes the whole graph once.
MaceStatus MaceMicroEngine::Run() {
  return engine_config_->graph_->Run(engine_config_);
}
// Fetches a model output's data pointer and shape; forwards to the graph.
MaceStatus MaceMicroEngine::GetOutputData(const uint32_t idx,
                                          void **output_data,
                                          const int32_t **output_dims,
                                          uint32_t *output_dim_size) {
  return engine_config_->graph_->GetOutputData(engine_config_, idx,
                                               output_data, output_dims,
                                               output_dim_size);
}
// Fetches an arbitrary op output (for debugging/inspection); forwards to
// the graph.
MaceStatus MaceMicroEngine::GetOpOutputData(const uint32_t op_def_idx,
                                            const uint32_t output_idx,
                                            void **output_data,
                                            const int32_t **output_dims,
                                            uint32_t *output_dim_size) {
  return engine_config_->graph_->GetOpOutputData(engine_config_, op_def_idx,
                                                 output_idx, output_data,
                                                 output_dims, output_dim_size);
}
// Copy construction/assignment are intentionally unsupported: the engine
// owns no copyable state, so both just trap via MACE_NOT_IMPLEMENTED.
MaceMicroEngine::MaceMicroEngine(const MaceMicroEngine &) {
  MACE_NOT_IMPLEMENTED;
}
MaceMicroEngine &MaceMicroEngine::operator=(const MaceMicroEngine &) {
  MACE_NOT_IMPLEMENTED;
  return *this;
}
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/framework/op_context.h"
#include "micro/framework/operator.h"
#include "micro/model/net_def.h"
#include "micro/model/operator_def.h"
#include "micro/include/public/micro.h"
namespace micro {
namespace framework {
MACE_DEFINE_OBJECT_FUNC(OpContext, uint32_t, op_idx)
MACE_DEFINE_PTR_ARRAY_FUNC(OpContext, OpIOInfo, input_info, input_infos_)
MACE_DEFINE_PTR_ARRAY_FUNC(OpContext, model::OutputShape,
output_resize_shape, output_resize_shapes_)
// Decodes this context's packed input descriptors, then initializes the
// operator instance registered at this context's op index.
MaceStatus OpContext::Init(MaceMicroEngineConfig *engine_config,
                           const model::OperatorDef *op_def) {
  // init OpContext
  uint32_t input_info_size = this->input_info_size();
  for (uint32_t i = 0; i < input_info_size; ++i) {
    Uint2OpIOInfo(this->input_info(i));
  }
  // init Op
  uint32_t op_i = op_idx();
  MACE_RETURN_IF_ERROR(
      engine_config->op_array_[op_i]->Init(engine_config, this, op_def));
  return MACE_SUCCESS;
}
// Runs the operator instance this context refers to.
MaceStatus OpContext::Run(MaceMicroEngineConfig *engine_config) {
  return engine_config->op_array_[op_idx()]->Run();
}
} // namespace framework
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_FRAMEWORK_OP_CONTEXT_H_
#define MICRO_FRAMEWORK_OP_CONTEXT_H_
#include "micro/base/serialize.h"
#include "micro/model/operator_def.h"
#include "micro/model/output_shape.h"
namespace micro {
struct MaceMicroEngineConfig;
namespace framework {
class Operator;
// Serialized per-operator context: the operator's index into the engine's
// op array, descriptors of where each input comes from, and the writable
// resized shape of each output.
class OpContext : public Serialize {
 public:
  MACE_DEFINE_HARD_CODE_MAGIC(OpContext)

  // Accessors for the serialized fields below.
  MACE_DECLARE_OBJECT_FUNC(uint32_t, op_idx);
  MACE_DECLARE_PTR_ARRAY_FUNC(OpIOInfo, input_info);
  MACE_DECLARE_PTR_ARRAY_FUNC(model::OutputShape, output_resize_shape);

  // Decodes input descriptors and initializes the referenced operator.
  MaceStatus Init(MaceMicroEngineConfig *engine_config,
                  const model::OperatorDef *op_def);
  // Runs the referenced operator.
  MaceStatus Run(MaceMicroEngineConfig *engine_config);

 protected:
  SerialUint32 op_idx_;
  SerialArray<OpIOInfo> input_infos_;
  SerialArray<model::OutputShape> output_resize_shapes_;
};
} // namespace framework
} // namespace micro
#endif // MICRO_FRAMEWORK_OP_CONTEXT_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/framework/operator.h"
#include "micro/base/utils.h"
#include "micro/framework/op_context.h"
#include "micro/include/port/define.h"
#include "micro/include/public/micro.h"
#include "micro/model/const_tensor.h"
#include "micro/model/input_output_info.h"
#include "micro/model/net_def.h"
#include "micro/model/operator_def.h"
namespace micro {
namespace framework {
namespace {
// Sentinel values for OpIOInfo::op_def_idx_: an input whose "producer op"
// is one of these comes from a constant tensor in the model data or from a
// user-registered model input, rather than from another operator's output.
const uint16_t kIdxConstTensor = 0xffff;
const uint16_t kIdxModelInput = 0xfffe;
}  // namespace

// Deliberately non-virtual; see the note in operator.h.
Operator::~Operator() {}
// Binds this operator to its engine config, runtime context and serialized
// definition, sanity-checks that the context describes the same input and
// output counts as the definition, then invokes the subclass hook OnInit().
// Fix: corrected the typo "dosen't" -> "doesn't" in both assert messages.
MaceStatus Operator::Init(MaceMicroEngineConfig *engine_config,
                          framework::OpContext *op_context,
                          const model::OperatorDef *op_def) {
  engine_config_ = engine_config;
  op_context_ = op_context;
  op_def_ = op_def;
  MACE_ASSERT1(op_def_->input_size() == op_context_->input_info_size(),
               "op_def_'s input doesn't match the op_context_'s");
  MACE_ASSERT1(
      op_def_->output_size() == op_context_->output_resize_shape_size(),
      "op_def_'s output doesn't match the op_context_'s");
  return OnInit();
}
// Base Run() must be overridden by concrete operators; traps if invoked.
MaceStatus Operator::Run() {
  MACE_NOT_IMPLEMENTED;
  return MACE_SUCCESS;
}
// Optional hook invoked from Init() and when an input buffer is
// (re)registered; the default does nothing.
MaceStatus Operator::OnInit() {
  return MACE_SUCCESS;
}
// Linear scan over the op definition's arguments; returns the first one
// whose name matches exactly, or NULL when absent.
const model::Argument *Operator::GetArgByName(const char *name) const {
  MACE_ASSERT(op_def_ != NULL);
  const uint32_t arg_count = op_def_->arg_size();
  for (uint32_t idx = 0; idx < arg_count; ++idx) {
    const model::Argument *candidate = op_def_->arg(idx);
    if (base::strcmp(name, candidate->name()) == 0) {
      return candidate;
    }
  }
  return NULL;
}
// Number of inputs declared by the op definition.
uint32_t Operator::GetInputSize() {
  return op_def_->input_size();
}
// Resolves input `idx` to a raw data pointer. Routing depends on the
// producer sentinel: a constant tensor reads from the model data blob at
// the tensor's offset; a model input reads the user-registered buffer;
// otherwise the data lives in the shared tensor arena at the producing
// op's recorded mem_offset.
const void *Operator::DoGetInputData(uint32_t idx) {
  const void *data = NULL;
  const OpIOInfo *input_info = op_context_->input_info(idx);
  const uint32_t op_def_idx = input_info->op_def_idx_;
  if (kIdxConstTensor == op_def_idx) {
    const model::ConstTensor *const_tensor =
        engine_config_->net_def_->tensor(input_info->output_idx_);
    data = engine_config_->model_data_ + const_tensor->offset();
  } else if (kIdxModelInput == op_def_idx) {
    data = engine_config_->input_buffers_[input_info->output_idx_];
  } else {
    const model::OperatorDef *pre_op_def =
        engine_config_->net_def_->op(op_def_idx);
    data = engine_config_->tensor_mem_ +
        pre_op_def->mem_offset(input_info->output_idx_);
  }
  return data;
}
// Rank of input `idx`, resolved through the same three-way routing as
// DoGetInputData: const tensor -> tensor dims, model input -> the net
// definition's input info, otherwise the producing op's output shape.
uint32_t Operator::GetInputShapeDimSize(uint32_t idx) {
  uint32_t dim_size = 0;
  const OpIOInfo *input_info = op_context_->input_info(idx);
  const uint32_t op_def_idx = input_info->op_def_idx_;
  if (kIdxConstTensor == op_def_idx) {
    const model::ConstTensor *const_tensor =
        engine_config_->net_def_->tensor(input_info->output_idx_);
    dim_size = const_tensor->dim_size();
  } else if (kIdxModelInput == op_def_idx) {
    const model::InputOutputInfo *info =
        engine_config_->net_def_->input_info(input_info->output_idx_);
    dim_size = info->dim_size();
  } else {
    const model::OperatorDef *op_def = engine_config_->net_def_->op(op_def_idx);
    const model::OutputShape *output_shape =
        op_def->output_shape(input_info->output_idx_);
    dim_size = output_shape->dim_size();
  }
  return dim_size;
}
// Dims array of input `idx`. Same routing, except that for a model input
// the dims come from the user-registered shape, not the net definition.
const int32_t *Operator::GetInputShapeDims(uint32_t idx) {
  const int32_t *dims = NULL;
  const OpIOInfo *input_info = op_context_->input_info(idx);
  const uint32_t op_def_idx = input_info->op_def_idx_;
  if (kIdxConstTensor == op_def_idx) {
    const model::ConstTensor *const_tensor =
        engine_config_->net_def_->tensor(input_info->output_idx_);
    dims = const_tensor->dim();
  } else if (kIdxModelInput == op_def_idx) {
    dims = engine_config_->input_shapes_[input_info->output_idx_];
  } else {
    const model::OperatorDef *op_def = engine_config_->net_def_->op(op_def_idx);
    const model::OutputShape *output_shape =
        op_def->output_shape(input_info->output_idx_);
    dims = output_shape->dim();
  }
  return dims;
}
// Number of outputs declared by the op definition.
uint32_t Operator::GetOutputSize() {
  return op_def_->output_size();
}
// Declared data type of output `idx`.
DataType Operator::GetOutputDataType(uint32_t idx) {
  return op_def_->output_type(idx);
}
// Pointer into the shared tensor arena where output `idx` is written.
void *Operator::DoGetOutputData(uint32_t idx) {
  return engine_config_->tensor_mem_ + op_def_->mem_offset(idx);
}
// Rank of output `idx`, taken from the context's (resizable) output shape;
// 0 when no shape entry exists.
uint32_t Operator::GetOutputShapeDimSize(uint32_t idx) {
  uint32_t dim_size = 0;
  model::OutputShape *output_shape =
      const_cast<model::OutputShape *>(op_context_->output_resize_shape(idx));
  if (output_shape != NULL) {
    dim_size = output_shape->dim_size();
  }
  return dim_size;
}
// Dims array of output `idx` from the context's output shape; NULL when no
// shape entry exists.
const int32_t *Operator::GetOutputShapeDims(uint32_t idx) {
  const int32_t *dims = NULL;
  model::OutputShape *output_shape =
      const_cast<model::OutputShape *>(op_context_->output_resize_shape(idx));
  if (output_shape != NULL) {
    dims = output_shape->dim();
  }
  return dims;
}
// Overwrites output `idx`'s resizable shape with `dims`. Debug builds
// verify that neither the rank nor the total element count grows beyond
// what the model reserved (memory is pre-planned, so outputs can only
// shrink or keep their size).
// NOTE(review): only the dim values are copied; the stored dim-count of
// `output_shape` is not updated here -- confirm that callers always pass
// dim_size equal to the stored rank.
MaceStatus Operator::ResizeOutputShape(uint32_t idx, uint32_t dim_size,
                                       const int32_t *dims) {
  model::OutputShape *output_shape =
      const_cast<model::OutputShape *>(op_context_->output_resize_shape(idx));
#ifndef NDEBUG
  // Rank may not exceed either the statically declared or the stored rank.
  if (op_def_->output_shape(idx)->dim_size() < dim_size
      || output_shape->dim_size() < dim_size) {
    LOG(FATAL) << "Can not support dynamic dim_size. op_def_dim_size = "
               << op_def_->output_shape(idx)->dim_size()
               << ", output_shape_dim_size = " << output_shape->dim_size()
               << ", dim_size = " << dim_size;
  }
  // Element count may not exceed the reserved element count.
  int32_t def_output_shape_size =
      base::GetShapeSize(output_shape->dim_size(), output_shape->dim());
  int32_t input_shape_size = base::GetShapeSize(dim_size, dims);
  if (def_output_shape_size < input_shape_size) {
    LOG(INFO) << op_def_->name() << " resize failed, because "
              << def_output_shape_size << " < " << input_shape_size;
    LOG(INFO) << "input: ";
    for (uint32_t i = 0; i < dim_size; ++i) {
      LOG(INFO) << dims[i] << ", ";
    }
    LOG(INFO) << "old output: ";
    for (uint32_t i = 0; i < output_shape->dim_size(); ++i) {
      LOG(INFO) << output_shape->dim(i) << ", ";
    }
    MACE_ASSERT(def_output_shape_size >= input_shape_size);
  }
#endif  // NDEBUG
  if (dim_size > 0) {
    base::memcpy(output_shape->mutable_dim(), dims, dim_size * sizeof(int32_t));
  }
  return MACE_SUCCESS;
}
// Defines a typed GetArgByName<T> specialization: looks the argument up by
// name and returns its FUNC() field, or `default_value` when absent.
#ifndef MACE_DEFINE_GET_ARG_BY_NAME_FUNC
#define MACE_DEFINE_GET_ARG_BY_NAME_FUNC(T, FUNC) \
  template <> \
  T Operator::GetArgByName(const char *name, T default_value) const { \
    const model::Argument *arg = GetArgByName(name); \
    if (arg == NULL) { \
      return default_value; \
    } else { \
      return arg->FUNC(); \
    } \
  }
#endif  // MACE_DEFINE_GET_ARG_BY_NAME_FUNC
// bool and int32_t both read the integer field `i`; float reads `f`.
MACE_DEFINE_GET_ARG_BY_NAME_FUNC(bool, i)
MACE_DEFINE_GET_ARG_BY_NAME_FUNC(int32_t, i)
MACE_DEFINE_GET_ARG_BY_NAME_FUNC(float, f)
// Defines a typed GetRepeatArgByName<T> specialization for repeated fields:
// returns the array (NULL when the argument is absent) and, when `size` is
// non-NULL, stores the element count into it.
#ifndef MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC
#define MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC(T, FUNC) \
  template <> \
  const T *Operator::GetRepeatArgByName(const char *name, \
                                        uint32_t *size) const { \
    const model::Argument *arg = GetArgByName(name); \
    if (arg == NULL) { \
      return NULL; \
    } \
    if (size != NULL) { \
      *size = arg->FUNC##_size(); \
    } \
    return arg->FUNC(); \
  }
#endif  // MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC
MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC(int32_t, ints)
MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC(float, floats)
MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC(uint8_t, s)
} // namespace framework
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_FRAMEWORK_OPERATOR_H_
#define MICRO_FRAMEWORK_OPERATOR_H_
#include "micro/base/logging.h"
#include "micro/base/types.h"
#include "micro/include/public/micro.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
struct MaceMicroEngineConfig;
namespace model {
class Argument;
class OperatorDef;
class OutputShape;
} // namespace model
namespace ops {
typedef framework::ScratchBuffer ScratchBuffer;
}
namespace framework {
// Declares a named enum for an operator's input indices, starting at 0,
// e.g. MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS).
#ifndef MACE_OP_INPUT_TAGS
#define MACE_OP_INPUT_TAGS(first_input, ...) \
  enum _InputTags { first_input = 0, __VA_ARGS__ }
#endif  // MACE_OP_INPUT_TAGS
// Same as above, for output indices.
#ifndef MACE_OP_OUTPUT_TAGS
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
  enum _OutputTags { first_input = 0, __VA_ARGS__ }
#endif  // MACE_OP_OUTPUT_TAGS
class OpContext;
// Base class of all micro operators. Concrete ops override OnInit()/Run()
// and use the protected accessors to reach their inputs, outputs and
// arguments through the engine config and serialized op context.
class Operator {
 public:
  Operator() {}
  // Note: This func should be virtual, but if we make it virtual,
  // the operator delete will be needed, which is in c++ runtime library.
  // For we don't use the Operator pointer to point sub-classes, the
  // virtual ~Operator() is not needed.
  ~Operator();

  // Binds engine config, context and definition, then calls OnInit().
  MaceStatus Init(MaceMicroEngineConfig *engine_config,
                  OpContext *op_context,
                  const model::OperatorDef *op_def);
  // Hook re-invoked when an input buffer is (re)registered.
  virtual MaceStatus OnInit();
  // Executes the operator; must be overridden.
  virtual MaceStatus Run();

  // Typed argument lookup by name, with a default for absent arguments.
  template<typename T>
  T GetArgByName(const char *name, T default_value) const;
  // Repeated-argument lookup; optionally reports the element count.
  template<typename T>
  const T *GetRepeatArgByName(const char *name,
                              uint32_t *size = NULL) const;

 protected:
  // Input accessors (routed through the op context; see operator.cc).
  uint32_t GetInputSize();
  const void *DoGetInputData(uint32_t idx);
  uint32_t GetInputShapeDimSize(uint32_t idx);
  const int32_t *GetInputShapeDims(uint32_t idx);

  // Output accessors (tensor arena plus resizable shapes).
  uint32_t GetOutputSize();
  DataType GetOutputDataType(uint32_t idx);
  void *DoGetOutputData(uint32_t idx);
  uint32_t GetOutputShapeDimSize(uint32_t idx);
  const int32_t *GetOutputShapeDims(uint32_t idx);
  // Overwrites an output's resizable dims (may not grow; see operator.cc).
  MaceStatus ResizeOutputShape(uint32_t idx, uint32_t input_dim_size,
                               const int32_t *input_dims);
  MaceStatus ReuseInputBufferForOutput(uint32_t output_idx, uint32_t input_idx);

  // Typed convenience wrappers over the Do* raw-pointer accessors.
  template<typename T>
  const T *GetInputData(uint32_t idx) {
    return static_cast<const T *>(DoGetInputData(idx));
  }
  template<typename T>
  T *GetOutputData(uint32_t idx) {
    return static_cast<T *>(DoGetOutputData(idx));
  }

 private:
  const model::Argument *GetArgByName(const char *name) const;

 protected:
  const model::OperatorDef *op_def_;
  MaceMicroEngineConfig *engine_config_;

 private:
  OpContext *op_context_;
};
} // namespace framework
} // namespace micro
#endif // MICRO_FRAMEWORK_OPERATOR_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/framework/scratch_buffer.h"
#include "micro/base/logging.h"
#include "micro/include/public/micro.h"
namespace micro {
namespace framework {
#ifndef NDEBUG
namespace {
int64_t kDetectHandle = -1;
}
#endif
ScratchBuffer::ScratchBuffer(MaceMicroEngineConfig *engine_config) :
    engine_config_(engine_config), offset_(0) {
#ifndef NDEBUG
  // Debug guard: remember which engine config this buffer carves from. If a
  // second ScratchBuffer were constructed for the same config while this one
  // is alive, both would hand out memory starting at offset 0 and overlap,
  // so assert instead of silently corrupting memory. (kDetectHandle is reset
  // to -1 in the destructor.)
  int64_t cur_handle = reinterpret_cast<int64_t>(engine_config);
  MACE_ASSERT1(cur_handle != kDetectHandle, "Detect scratch buffer error.");
  kDetectHandle = cur_handle;
#endif
}
ScratchBuffer::~ScratchBuffer() {
#ifndef NDEBUG
  // Release the debug guard so a new ScratchBuffer may be constructed.
  kDetectHandle = -1;
#endif
}
// Carves `size` bytes (rounded up to a 4-byte multiple so successive
// allocations stay word aligned) out of the engine's scratch region.
// Aborts via LOG(FATAL) when the region is exhausted.
void *ScratchBuffer::DoGetBuffer(uint32_t size) {
  const uint32_t rounded = (size + 3) / 4 * 4;
  const uint32_t new_offset = offset_ + rounded;
  if (new_offset > engine_config_->scratch_buffer_size_) {
    LOG(FATAL) << "The scratch buffer is not enough."
               << "offset_: " << offset_ << ", size: " << rounded
               << ", engine_config_->scratch_buffer_size_: "
               << engine_config_->scratch_buffer_size_;
  }
  void *buffer = engine_config_->scratch_buffer_ + offset_;
  offset_ = new_offset;
  return buffer;
}
} // namespace framework
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_FRAMEWORK_SCRATCH_BUFFER_H_
#define MICRO_FRAMEWORK_SCRATCH_BUFFER_H_
#include "micro/base/logging.h"
#include "micro/include/public/micro.h"
namespace micro {
namespace framework {
// Bump allocator over the engine's preallocated scratch region
// (engine_config->scratch_buffer_). Allocations are never freed
// individually; the region is reclaimed simply by constructing a fresh
// ScratchBuffer (the constructor resets the offset to 0).
class ScratchBuffer {
 public:
  explicit ScratchBuffer(MaceMicroEngineConfig *engine_config);
  ~ScratchBuffer();

  // Returns an uninitialized array of `size` elements of T. This signed
  // overload additionally asserts size > 0 before converting.
  template<typename T>
  T *GetBuffer(int32_t size) {
    MACE_ASSERT(size > 0);
    return static_cast<T *>(
        DoGetBuffer(static_cast<uint32_t>(size) * sizeof(T)));
  }
  template<typename T>
  T *GetBuffer(uint32_t size) {
    return static_cast<T *>(DoGetBuffer(size * sizeof(T)));
  }

 private:
  // Rounds `size` up to a 4-byte multiple and carves it from the region.
  void *DoGetBuffer(uint32_t size);

 private:
  const MaceMicroEngineConfig *engine_config_;
  uint32_t offset_;  // bytes already handed out
};
} // namespace framework
} // namespace micro
#endif // MICRO_FRAMEWORK_SCRATCH_BUFFER_H_
package(
    default_visibility = ["//visibility:public"],
)

licenses(["notice"])  # Apache 2.0

# Full header surface of micro: public API plus the port layer and
# header-only utils.
cc_library(
    name = "include",
    hdrs = glob([
        "public/*.h",
        "port/*.h",
        "utils/*.h",
    ]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
)

# Narrower target exposing only the public API headers.
cc_library(
    name = "public_headers",
    hdrs = glob([
        "public/*.h",
    ]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_INCLUDE_PORT_DEFINE_H_
#define MICRO_INCLUDE_PORT_DEFINE_H_

// On micro these export/deprecation markers expand to nothing (no shared
// library visibility machinery on embedded targets).
#define MACE_API
#define MACE_DEPRECATED

// Fallbacks below only take effect on toolchains that do not predefine
// them. NOTE(review): __FILE__/__LINE__/NULL are predefined by all
// conforming compilers; these guards target minimal embedded toolchains.
#ifndef __FILE__
#define __FILE__ ""
#endif

#ifndef __LINE__
#define __LINE__ 0
#endif

#ifndef NULL
#define NULL 0
#endif

#endif  // MICRO_INCLUDE_PORT_DEFINE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_INCLUDE_PUBLIC_MICRO_H_
#define MICRO_INCLUDE_PUBLIC_MICRO_H_
#include <stdint.h>
#include "micro/include/port/define.h"
namespace micro {

// Tensor memory layouts and filter layouts used by converted models.
enum DataFormat {
  NONE = 0, NHWC = 1, NCHW = 2,
  HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103,
  AUTO = 1000,
};

// Performance hint levels.
enum PerfHint {
  PERF_DEFAULT = 0,
  PERF_LOW = 1,
  PERF_NORMAL = 2,
  PERF_HIGH = 3
};

// Element types supported by the runtime.
enum DataType {
  DT_INVALID = 0,
  DT_FLOAT = 1,
  DT_UINT8 = 2,
  DT_HALF = 3,
  DT_INT32 = 4,
  DT_FLOAT16 = 5,
  DT_BFLOAT16 = 6,
};

// Status codes returned by all public entry points.
enum MaceStatus {
  MACE_SUCCESS = 0,
  MACE_INVALID_ARGS = 1,
  MACE_OUT_OF_RESOURCES = 2,
  MACE_UNSUPPORTED = 3,
  MACE_RUNTIME_ERROR = 4,
};

namespace model {
class NetDef;
}  // namespace model
namespace framework {
class Graph;
class Operator;
}  // namespace framework

// Everything a MaceMicroEngine needs to run one model instance.
// NOTE(review): presumably filled in by generated per-model code — confirm.
struct MACE_API MaceMicroEngineConfig {
  model::NetDef *net_def_;          // serialized graph definition
  const uint8_t *model_data_;       // constant tensor (weight) payload
  framework::Graph *graph_;
  framework::Operator **op_array_;  // one Operator per op in net_def_
  // NOTE(review): presumably the intermediate-tensor arena — confirm.
  uint8_t *tensor_mem_;
  const void **input_buffers_;      // presumably set via RegisterInputData
  const int32_t **input_shapes_;    // presumably set via RegisterInputData
  uint8_t *scratch_buffer_;         // workspace consumed by ScratchBuffer
  uint32_t scratch_buffer_size_;    // size of scratch_buffer_ in bytes
};

// Inference engine for one model instance. Non-copyable.
class MACE_API MaceMicroEngine {
 public:
  MaceMicroEngine() {}
  ~MaceMicroEngine() {}

  // Binds the engine to its config; call before any other method.
  MaceStatus Init(MaceMicroEngineConfig *engine_config);

  // Registers data and shape for model input `idx`.
  // NOTE(review): buffer ownership/lifetime is not visible here —
  // presumably borrowed, so it must outlive Run(); confirm.
  MaceStatus RegisterInputData(uint32_t idx, const void *input_buffer,
                               const int32_t *input_dims);

  // Runs one inference over the whole graph.
  MaceStatus Run();

  // Fetches data/shape of model output `idx`.
  MaceStatus GetOutputData(const uint32_t idx, void **output_data,
                           const int32_t **output_dims,
                           uint32_t *output_dim_size);

  // Fetches the `output_idx`-th output of the `op_def_idx`-th op.
  MaceStatus GetOpOutputData(const uint32_t op_def_idx,
                             const uint32_t output_idx,
                             void **output_data,
                             const int32_t **output_dims,
                             uint32_t *output_dim_size);

 private:
  MaceMicroEngineConfig *engine_config_;

  // Copy disallowed: declared, never defined (pre-C++11 idiom).
  MaceMicroEngine(const MaceMicroEngine &);
  MaceMicroEngine &operator=(const MaceMicroEngine &);
};

}  // namespace micro
#endif // MICRO_INCLUDE_PUBLIC_MICRO_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_INCLUDE_UTILS_BFLOAT16_H_
#define MICRO_INCLUDE_UTILS_BFLOAT16_H_
#include <stdint.h>
#ifdef MACE_ENABLE_BFLOAT16
namespace micro {
// Type-punning helper: view the same 32-bit pattern as either an unsigned
// integer or an IEEE-754 float.
union Sphinx {
  uint32_t i;
  float f;

  Sphinx(uint32_t value) : i(value) {}

  Sphinx(float value) : f(value) {}
};

// Truncated bfloat16: stores only the upper 16 bits of a 32-bit float
// (sign, 8 exponent bits, top 7 mantissa bits).
class BFloat16 {
 public:
  BFloat16();  // not defined in this header

  // Expands to float by placing the stored bits in the high half-word.
  operator float() const {
    // Widen BEFORE shifting: `data_ << 16` promotes the uint16_t to
    // (signed) int, and shifting a bit into the sign bit (data_ >= 0x8000)
    // is undefined behavior. Shifting a uint32_t is well defined.
    return Sphinx(static_cast<uint32_t>(data_) << 16).f;
  }

  void operator=(const BFloat16 &value) {
    data_ = value.data_;
  }

  // Converts from float by dropping the low 16 mantissa bits
  // (truncation, no rounding).
  void operator=(float value) {
    data_ = Sphinx(value).i >> 16;
  }

 public:
  uint16_t data_;
};
} // namespace micro
#endif // MACE_ENABLE_BFLOAT16
#endif // MICRO_INCLUDE_UTILS_BFLOAT16_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_INCLUDE_UTILS_MACROS_H_
#define MICRO_INCLUDE_UTILS_MACROS_H_
#include "micro/include/public/micro.h"
namespace micro {

// Injects a public empty virtual destructor into a class body, for types
// that are deleted through a base pointer.
#ifndef MACE_EMPTY_VIRTUAL_DESTRUCTOR
#define MACE_EMPTY_VIRTUAL_DESTRUCTOR(CLASSNAME) \
 public:                                         \
  virtual ~CLASSNAME() {}
#endif  // MACE_EMPTY_VIRTUAL_DESTRUCTOR

// Silences unused-variable/parameter warnings (micro builds with
// -Wextra -Werror, see the BUILD files).
#define MACE_UNUSED(var) (void)(var)

}  // namespace micro
#endif // MICRO_INCLUDE_UTILS_MACROS_H_
def if_hexagon_enabled(a):
    """Returns `a` only when the //micro:hexagon_enabled config is set."""
    return select({
        "//micro:hexagon_enabled": a,
        "//conditions:default": [],
    })

def if_not_hexagon_enabled(a):
    """Returns `a` only when the //micro:hexagon_enabled config is NOT set."""
    return select({
        "//micro:hexagon_enabled": [],
        "//conditions:default": a,
    })
def new_local_repository_env_impl(repository_ctx):
    """Implementation of new_local_repository_env.

    Resolves environment variables in `path` by echoing it through bash,
    then symlinks every child of the resolved directory, plus the given
    BUILD file, into the repository workspace.
    """
    # bash expands $VARS embedded in the configured path; keep only the
    # first line of the output.
    echo_cmd = "echo " + repository_ctx.attr.path
    echo_result = repository_ctx.execute(["bash", "-c", echo_cmd])
    src_path_str = echo_result.stdout.splitlines()[0]
    source_path = repository_ctx.path(src_path_str)
    work_path = repository_ctx.path(".")
    # Mirror the source directory entry-by-entry via symlinks.
    child_list = source_path.readdir()
    for child in child_list:
        child_name = child.basename
        repository_ctx.symlink(child, work_path.get_child(child_name))
    # Install the caller-supplied BUILD file at the repository root.
    build_file_babel = Label("//:" + repository_ctx.attr.build_file)
    build_file_path = repository_ctx.path(build_file_babel)
    repository_ctx.symlink(build_file_path, work_path.get_child("BUILD"))

# a new_local_repository support environment variable
new_local_repository_env = repository_rule(
    implementation = new_local_repository_env_impl,
    local = True,
    attrs = {
        "path": attr.string(mandatory = True),
        "build_file": attr.string(mandatory = True),
    },
)
package(
    default_visibility = ["//visibility:public"],
)

licenses(["notice"])  # Apache 2.0

# Serialized model schema classes (NetDef, OperatorDef, ConstTensor, ...).
cc_library(
    name = "model",
    srcs = glob(["*.cc"]),
    hdrs = glob(["*.h"]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        "//micro/base",
        "//micro/include",
    ],
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/model/argument.h"

namespace micro {
namespace model {

// Out-of-line definitions of the serialized-field accessors declared with
// the matching MACE_DECLARE_* macros in argument.h.
MACE_DEFINE_STRING_FUNC(Argument, name, name_)
MACE_DEFINE_OBJECT_FUNC(Argument, float, f)
MACE_DEFINE_OBJECT_FUNC(Argument, int32_t, i)
MACE_DEFINE_BYTES_FUNC(Argument, s, s_)
MACE_DEFINE_ARRAY_FUNC(Argument, float, floats, floats_)
MACE_DEFINE_ARRAY_BASE_PTR_FUNC(Argument, float, floats, floats_)
MACE_DEFINE_ARRAY_FUNC(Argument, int32_t, ints, ints_)
MACE_DEFINE_ARRAY_BASE_PTR_FUNC(Argument, int32_t, ints, ints_)

} // namespace model
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_MODEL_ARGUMENT_H_
#define MICRO_MODEL_ARGUMENT_H_
#include "micro/base/serialize.h"
namespace micro {
namespace model {
// Serialized named attribute attached to an op or the graph (see the `arg`
// lists on OperatorDef and NetDef). Carries one scalar float/int, a byte
// string, and repeated float/int payloads.
class Argument : public Serialize {
 public:
  MACE_DEFINE_HARD_CODE_MAGIC(Argument)

  MACE_DECLARE_STRING_FUNC(name);
  MACE_DECLARE_OBJECT_FUNC(float, f);
  MACE_DECLARE_OBJECT_FUNC(int32_t, i);
  MACE_DECLARE_BYTES_FUNC(s);
  MACE_DECLARE_ARRAY_FUNC(float, floats);
  MACE_DECLARE_ARRAY_BASE_PTR_FUNC(float, floats);
  MACE_DECLARE_ARRAY_FUNC(int32_t, ints);
  MACE_DECLARE_ARRAY_BASE_PTR_FUNC(int32_t, ints);

 private:
  // NOTE(review): member order appears to mirror the serialized layout;
  // confirm against the serializer before reordering.
  SerialString name_;
  SerialFloat f_;
  SerialInt32 i_;
  SerialBytes s_;
  SerialArray<SerialFloat> floats_;
  SerialArray<SerialInt32> ints_;
};
} // namespace model
} // namespace micro
#endif // MICRO_MODEL_ARGUMENT_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/model/const_tensor.h"

namespace micro {
namespace model {

// Out-of-line definitions of the serialized-field accessors declared with
// the matching MACE_DECLARE_* macros in const_tensor.h.
MACE_DEFINE_ARRAY_FUNC(ConstTensor, int32_t, dim, dims_)
MACE_DEFINE_OBJECT_FUNC(ConstTensor, DataType, data_type)
MACE_DEFINE_ARRAY_FUNC(ConstTensor, float, float_data, float_datas_)
MACE_DEFINE_ARRAY_FUNC(ConstTensor, int32_t, int32_data, int32_datas_)
MACE_DEFINE_STRING_FUNC(ConstTensor, name, name_)
MACE_DEFINE_OBJECT_FUNC(ConstTensor, int32_t, offset)
MACE_DEFINE_OBJECT_FUNC(ConstTensor, int32_t, data_size)
MACE_DEFINE_OBJECT_FUNC(ConstTensor, float, scale)
MACE_DEFINE_OBJECT_FUNC(ConstTensor, int32_t, zero_point)
MACE_DEFINE_OBJECT_FUNC(ConstTensor, float, minval)
MACE_DEFINE_OBJECT_FUNC(ConstTensor, float, maxval)
MACE_DEFINE_OBJECT_FUNC(ConstTensor, bool, quantized)
MACE_DEFINE_OBJECT_FUNC(ConstTensor, uint32_t, node_id)

// No-arg overload: returns the whole dims array, which lives at
// `dims_.offset_` bytes past `this` in the serialized model buffer.
const int32_t *ConstTensor::dim() const {
  const int32_t *array = reinterpret_cast<const int32_t *>(
      reinterpret_cast<const uint8_t *>(this) + dims_.offset_);
  return array;
}

} // namespace model
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_MODEL_CONST_TENSOR_H_
#define MICRO_MODEL_CONST_TENSOR_H_
#include "micro/base/serialize.h"
#include "micro/include/public/micro.h"
namespace micro {
namespace model {
// Serialized view of one constant (weight) tensor in the model.
class ConstTensor : public Serialize {
 public:
  MACE_DEFINE_HARD_CODE_MAGIC(ConstTensor)

  MACE_DECLARE_ARRAY_FUNC(int32_t, dim);
  MACE_DECLARE_OBJECT_FUNC(DataType, data_type);
  MACE_DECLARE_ARRAY_FUNC(float, float_data);
  MACE_DECLARE_ARRAY_FUNC(int32_t, int32_data);
  MACE_DECLARE_STRING_FUNC(name);
  // NOTE(review): presumably the byte offset of this tensor's raw data in
  // the model data blob — confirm against the converter.
  MACE_DECLARE_OBJECT_FUNC(int32_t, offset);
  MACE_DECLARE_OBJECT_FUNC(int32_t, data_size);
  // Quantization parameters.
  MACE_DECLARE_OBJECT_FUNC(float, scale);
  MACE_DECLARE_OBJECT_FUNC(int32_t, zero_point);
  MACE_DECLARE_OBJECT_FUNC(float, minval);
  MACE_DECLARE_OBJECT_FUNC(float, maxval);
  MACE_DECLARE_OBJECT_FUNC(bool, quantized);
  MACE_DECLARE_OBJECT_FUNC(uint32_t, node_id);
  // Returns the whole dims array (the macro above declares the per-index
  // accessor with the same name).
  const int32_t *dim() const;

 private:
  // NOTE(review): member order appears to mirror the serialized layout
  // (accessors read at fixed offsets); confirm before reordering. Also
  // note data_type_ is a raw DataType while sibling classes use Serial*
  // wrappers — verify this is intentional.
  SerialArray<SerialInt32> dims_;
  DataType data_type_;
  SerialArray<SerialFloat> float_datas_;
  SerialArray<SerialInt32> int32_datas_;
  SerialString name_;
  SerialInt32 offset_;
  SerialInt32 data_size_;
  SerialFloat scale_;
  SerialInt32 zero_point_;
  SerialFloat minval_;
  SerialFloat maxval_;
  SerialBool quantized_;
  SerialUint32 node_id_;
};
} // namespace model
} // namespace micro
#endif // MICRO_MODEL_CONST_TENSOR_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/model/input_output_info.h"

namespace micro {
namespace model {

// Out-of-line definitions of the serialized-field accessors declared with
// the matching MACE_DECLARE_* macros in input_output_info.h.
MACE_DEFINE_STRING_FUNC(InputOutputInfo, name, name_)
MACE_DEFINE_OBJECT_FUNC(InputOutputInfo, int32_t, node_id)
MACE_DEFINE_ARRAY_FUNC(InputOutputInfo, int32_t, dim, dims_)
MACE_DEFINE_OBJECT_FUNC(InputOutputInfo, int32_t, max_byte_size)
MACE_DEFINE_OBJECT_FUNC(InputOutputInfo, int32_t, data_type)
MACE_DEFINE_OBJECT_FUNC(InputOutputInfo, int32_t, data_format)
MACE_DEFINE_OBJECT_FUNC(InputOutputInfo, float, scale)
MACE_DEFINE_OBJECT_FUNC(InputOutputInfo, int32_t, zero_point)

} // namespace model
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_MODEL_INPUT_OUTPUT_INFO_H_
#define MICRO_MODEL_INPUT_OUTPUT_INFO_H_
#include "micro/base/serialize.h"
namespace micro {
namespace model {
// Serialized description of one model input or output: name, shape, data
// type/format and quantization parameters.
class InputOutputInfo : public Serialize {
 public:
  MACE_DEFINE_HARD_CODE_MAGIC(InputOutputInfo)

  MACE_DECLARE_STRING_FUNC(name);
  MACE_DECLARE_OBJECT_FUNC(int32_t, node_id);
  MACE_DECLARE_ARRAY_FUNC(int32_t, dim);
  MACE_DECLARE_OBJECT_FUNC(int32_t, max_byte_size);
  MACE_DECLARE_OBJECT_FUNC(int32_t, data_type);
  MACE_DECLARE_OBJECT_FUNC(int32_t, data_format);
  // Quantization parameters.
  MACE_DECLARE_OBJECT_FUNC(float, scale);
  MACE_DECLARE_OBJECT_FUNC(int32_t, zero_point);

 private:
  // NOTE(review): member order appears to mirror the serialized layout;
  // confirm before reordering.
  SerialString name_;
  SerialInt32 node_id_;
  SerialArray<SerialInt32> dims_;
  SerialInt32 max_byte_size_;
  SerialInt32 data_type_;
  SerialInt32 data_format_;
  SerialFloat scale_;
  SerialInt32 zero_point_;
};
} // namespace model
} // namespace micro
#endif // MICRO_MODEL_INPUT_OUTPUT_INFO_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/model/net_def.h"

namespace micro {
namespace model {

// Out-of-line definitions of the serialized-field accessors declared with
// the matching MACE_DECLARE_* macros in net_def.h.
MACE_DEFINE_PTR_ARRAY_FUNC(NetDef, OperatorDef, op, ops_)
MACE_DEFINE_PTR_ARRAY_FUNC(NetDef, Argument, arg, args_)
MACE_DEFINE_PTR_ARRAY_FUNC(NetDef, ConstTensor, tensor, tensors_)
MACE_DEFINE_OBJECT_FUNC(NetDef, int32_t, data_type)
MACE_DEFINE_PTR_ARRAY_FUNC(NetDef, InputOutputInfo, input_info, input_infos_)
MACE_DEFINE_PTR_ARRAY_FUNC(NetDef, InputOutputInfo, output_info, output_infos_)

} // namespace model
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_MODEL_NET_DEF_H_
#define MICRO_MODEL_NET_DEF_H_
#include "micro/base/serialize.h"
#include "micro/model/argument.h"
#include "micro/model/const_tensor.h"
#include "micro/model/input_output_info.h"
#include "micro/model/operator_def.h"
namespace micro {
namespace model {
// Top-level serialized model: the op list, graph-level arguments, constant
// (weight) tensors and input/output descriptions.
class NetDef : public Serialize {
 public:
  MACE_DEFINE_HARD_CODE_MAGIC(NetDef)

  MACE_DECLARE_PTR_ARRAY_FUNC(OperatorDef, op);
  MACE_DECLARE_PTR_ARRAY_FUNC(Argument, arg);
  MACE_DECLARE_PTR_ARRAY_FUNC(ConstTensor, tensor);
  MACE_DECLARE_OBJECT_FUNC(int32_t, data_type);
  MACE_DECLARE_PTR_ARRAY_FUNC(InputOutputInfo, input_info);
  MACE_DECLARE_PTR_ARRAY_FUNC(InputOutputInfo, output_info);

 private:
  // NOTE(review): member order appears to mirror the serialized layout;
  // confirm before reordering.
  SerialArray<OperatorDef> ops_;
  SerialArray<Argument> args_;
  SerialArray<ConstTensor> tensors_;
  SerialInt32 data_type_;
  SerialArray<InputOutputInfo> input_infos_;
  SerialArray<InputOutputInfo> output_infos_;
};
} // namespace model
} // namespace micro
#endif // MICRO_MODEL_NET_DEF_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/model/operator_def.h"

namespace micro {
namespace model {

// Out-of-line definitions of the serialized-field accessors declared with
// the matching MACE_DECLARE_* macros in operator_def.h.
MACE_DEFINE_STRING_ARRAY_FUNC(OperatorDef, input, inputs_)
MACE_DEFINE_STRING_ARRAY_FUNC(OperatorDef, output, outputs_)
MACE_DEFINE_STRING_FUNC(OperatorDef, name, name_)
MACE_DEFINE_STRING_FUNC(OperatorDef, type, type_)
MACE_DEFINE_OBJECT_FUNC(OperatorDef, int32_t, device_type)
MACE_DEFINE_PTR_ARRAY_FUNC(OperatorDef, Argument, arg, args_)
MACE_DEFINE_PTR_ARRAY_FUNC(OperatorDef, OutputShape,
                           output_shape, output_shapes_)
MACE_DEFINE_ARRAY_FUNC(OperatorDef, DataType, output_type, output_types_)
// the mem_offset is the mem_id in proto file
MACE_DEFINE_ARRAY_FUNC(OperatorDef, int32_t, mem_offset, mem_offsets_)

} // namespace model
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_MODEL_OPERATOR_DEF_H_
#define MICRO_MODEL_OPERATOR_DEF_H_
#include "micro/base/serialize.h"
#include "micro/include/public/micro.h"
#include "micro/model/argument.h"
#include "micro/model/output_shape.h"
namespace micro {
namespace model {
// Serialized definition of one op in the graph: its tensor names, type,
// per-op arguments, precomputed output shapes/types and memory plan.
class OperatorDef : public Serialize {
 public:
  MACE_DEFINE_HARD_CODE_MAGIC(OperatorDef)

  MACE_DECLARE_STRING_ARRAY_FUNC(input);
  MACE_DECLARE_STRING_ARRAY_FUNC(output);
  MACE_DECLARE_STRING_FUNC(name);
  MACE_DECLARE_STRING_FUNC(type);
  MACE_DECLARE_OBJECT_FUNC(int32_t, device_type);
  MACE_DECLARE_PTR_ARRAY_FUNC(Argument, arg);
  MACE_DECLARE_PTR_ARRAY_FUNC(OutputShape, output_shape);
  MACE_DECLARE_ARRAY_FUNC(DataType, output_type);
  // the mem_offset is the mem_id in proto file
  MACE_DECLARE_ARRAY_FUNC(int32_t, mem_offset);

 private:
  // NOTE(review): member order appears to mirror the serialized layout;
  // confirm before reordering.
  SerialArray<SerialString> inputs_;
  SerialArray<SerialString> outputs_;
  SerialString name_;
  SerialString type_;
  // device_type_ is not used currently, for future;
  SerialInt32 device_type_;
  SerialArray<Argument> args_;
  SerialArray<OutputShape> output_shapes_;
  SerialArray<DataType> output_types_;
  SerialArray<SerialInt32> mem_offsets_;
};
} // namespace model
} // namespace micro
#endif // MICRO_MODEL_OPERATOR_DEF_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/model/output_shape.h"

namespace micro {
namespace model {

// Per-index accessor generated from the declaration in output_shape.h.
MACE_DEFINE_ARRAY_FUNC(OutputShape, int32_t, dim, dims_)

// No-arg overload: returns the whole dims array, which lives at
// `dims_.offset_` bytes past `this` in the serialized model buffer.
const int32_t *OutputShape::dim() const {
  const int32_t *array = reinterpret_cast<const int32_t *>(
      reinterpret_cast<const char *>(this) + dims_.offset_);
  return array;
}
// Writable view of the dims array (same address computation as dim()).
int32_t *OutputShape::mutable_dim() {
  // No const_cast needed: `this` is already non-const inside a non-const
  // member function.
  char *base_addr = reinterpret_cast<char *>(this);
  int32_t *array = reinterpret_cast<int32_t *>(base_addr + dims_.offset_);
  return array;
}
} // namespace model
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_MODEL_OUTPUT_SHAPE_H_
#define MICRO_MODEL_OUTPUT_SHAPE_H_
#include "micro/base/serialize.h"
namespace micro {
namespace model {
class OutputShape : public Serialize {
public:
MACE_DEFINE_HARD_CODE_MAGIC(OutputShape)
MACE_DECLARE_ARRAY_FUNC(int32_t, dim);
const int32_t *dim() const;
int32_t *mutable_dim();
private:
SerialArray<SerialInt32> dims_;
};
} // namespace model
} // namespace micro
#endif // MICRO_MODEL_OUTPUT_SHAPE_H_
package(
    default_visibility = ["//visibility:public"],
)

licenses(["notice"])  # Apache 2.0

# Production op kernels, built against the regular framework.
cc_library(
    name = "ops",
    srcs = glob(["**/*.cc"]),
    hdrs = glob(["**/*.h"]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        "//micro/base",
        "//micro/framework",
    ],
)

# Same sources built against the test variant of the framework.
# alwayslink = 1 forces every object file into dependent test binaries,
# even when its symbols are only referenced indirectly.
cc_library(
    name = "ops_for_test",
    srcs = glob(["**/*.cc"]),
    hdrs = glob(["**/*.h"]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        "//micro/base",
        "//micro/framework:framework_for_optest",
    ],
    alwayslink = 1,
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/activation.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/model/argument.h"
namespace micro {
namespace ops {
namespace {
// PReLU: negative inputs are scaled by the learned per-channel slope
// alpha_ptr[c]; non-negative inputs pass through unchanged. The input is
// treated as an (outer_size x channel) matrix with channel innermost.
template<typename T>
void PReLUActivation(const T *input_ptr, const int32_t outer_size,
                     const int32_t channel, const T *alpha_ptr,
                     T *output_ptr) {
  for (int32_t outer = 0; outer < outer_size; ++outer) {
    const T *in_row = input_ptr + outer * channel;
    T *out_row = output_ptr + outer * channel;
    for (int32_t c = 0; c < channel; ++c) {
      const T value = in_row[c];
      out_row[c] = (value < 0) ? value * alpha_ptr[c] : value;
    }
  }
}
} // namespace
MaceStatus ActivationOp::OnInit() {
  // Cache the input tensor's data/shape and the output pointer, then let
  // the Activation helper read its type and parameters from this op's
  // arguments.
  input_ = GetInputData<mifloat>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  output_ = GetOutputData<mifloat>(OUTPUT);
  return activation_.Init(this);
}
MaceStatus ActivationOp::Run() {
  // Activations are element-wise, so the output shape equals the input's.
  MACE_RETURN_IF_ERROR(ResizeOutputShape(OUTPUT, input_dim_size_, input_dims_));
  if (activation_.GetActivationType() == PRELU) {
    // PReLU needs the per-channel alpha tensor as the second input; the
    // channel dimension is taken to be the innermost (last) one.
    MACE_ASSERT(GetInputSize() > 1);
    const mifloat *alpha = GetInputData<mifloat>(ALPHA);
    const int32_t outer_size =
        base::accumulate_multi(input_dims_, 0, input_dim_size_ - 1);
    const int32_t channel = input_dims_[input_dim_size_ - 1];
    PReLUActivation(input_, outer_size, channel, alpha, output_);
    return MACE_SUCCESS;
  } else {
    // All other activation types are computed flat over the whole tensor.
    const int32_t input_size = base::GetShapeSize(input_dim_size_, input_dims_);
    return activation_.Compute(input_, input_size, output_);
  }
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_ACTIVATION_H_
#define MICRO_OPS_ACTIVATION_H_
#include "micro/framework/operator.h"
#include "micro/ops/utils/activation.h"
namespace micro {
namespace ops {
// Element-wise activation op (ReLU family, sigmoid, etc. per the
// Activation helper); PReLU additionally consumes an ALPHA input.
class ActivationOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();

 private:
  const mifloat *input_;       // cached in OnInit
  const int32_t *input_dims_;
  uint32_t input_dim_size_;
  mifloat *output_;
  Activation activation_;      // holds the activation type/params

  MACE_OP_INPUT_TAGS(INPUT, ALPHA);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_ACTIVATION_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_ARGMAX_H_
#define MICRO_OPS_ARGMAX_H_
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/operator.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/include/utils/macros.h"
namespace micro {
namespace ops {
// ArgMax/ArgMin operator: for each slice along the LAST input dimension,
// writes the index of the extreme element into the int32 output tensor.
// Only axis == last dimension and keepdims == true are supported
// (enforced by asserts); the "argmin" argument switches to ArgMin.
template<class T>
class ArgMaxOp : public framework::Operator {
 public:
  // Caches op arguments and tensor pointers/shapes; rejects scalar inputs
  // and keepdims == false.
  MaceStatus OnInit() {
    axis_ = GetArgByName("axis", static_cast<int32_t>(0));
    keep_dims_ = GetArgByName("keepdims", true);
    MACE_ASSERT1(keep_dims_, "Mace only supports keep_dims ArgMax.");
    argmin_ = GetArgByName("argmin", false);
    input_ = GetInputData<T>(INPUT);
    input_dims_ = GetInputShapeDims(INPUT);
    input_dim_size_ = GetInputShapeDimSize(INPUT);
    MACE_ASSERT1(input_dim_size_ > 0, "ArgMax input should not be a scalar");
    output_ = GetOutputData<int32_t>(OUTPUT);
    output_dims_ = GetOutputShapeDims(OUTPUT);
    output_dim_size_ = GetOutputShapeDimSize(OUTPUT);
    return MACE_SUCCESS;
  }
  // Resolves the reduction axis (from the optional AXIS input, else the
  // "axis" argument), resizes the output to the input shape minus that
  // axis, then scans every inner slice for the extreme element's index.
  MaceStatus Run() {
    int32_t axis_value = 0;
    // An optional second input overrides the "axis" argument.
    const int32_t *axis = GetInputSize() == 2 ?
                          GetInputData<int32_t>(AXIS) : NULL;
    if (axis != NULL) {
      MACE_ASSERT1(GetInputShapeDimSize(AXIS) == 0,
                   "Mace argmax only supports scalar axis");
      axis_value = axis[0];
    } else {
      axis_value = axis_;
    }
    if (axis_value < 0) {
      // Negative axis counts back from the last dimension.
      axis_value += input_dim_size_;
    }
    MACE_ASSERT1(axis_value == static_cast<int32_t>(input_dim_size_) - 1,
                 "Mace argmax only supports last dimension as axis");
    MACE_ASSERT1(output_dim_size_ >= input_dim_size_ - 1,
                 "Convert model error.");
    // Output shape = input shape with the reduced axis removed.
    int32_t *output_dims =
        ScratchBuffer(engine_config_).GetBuffer<int32_t>(output_dim_size_);
    for (int32_t d = 0; d < static_cast<int32_t>(output_dim_size_); ++d) {
      output_dims[d] = input_dims_[d < axis_value ? d : d + 1];
    }
    ResizeOutputShape(OUTPUT, output_dim_size_, output_dims);
    int32_t outer_size = base::GetShapeSize(output_dim_size_, output_dims_);
    int32_t inner_size = input_dims_[axis_value];  // length of reduced axis
    if (argmin_) {
      for (int32_t i = 0; i < outer_size; ++i) {
        int32_t idx = 0;
        T min_value = base::highest();
        const T *input_ptr = input_ + i * inner_size;
        for (int32_t j = 0; j < inner_size; ++j) {
          float input = input_ptr[j];
          if (input < min_value) {
            min_value = input;
            idx = j;
          }
        }
        output_[i] = idx;
      }
    } else {
      for (int32_t i = 0; i < outer_size; ++i) {
        int32_t idx = 0;
        T max_value = base::lowest();
        const T *input_ptr = input_ + i * inner_size;
        for (int32_t j = 0; j < inner_size; ++j) {
          float input = input_ptr[j];
          if (input > max_value) {
            max_value = input;
            idx = j;
          }
        }
        output_[i] = idx;
      }
    }
    return MaceStatus::MACE_SUCCESS;
  }
 private:
  int32_t axis_;     // "axis" argument; may be overridden by the AXIS input
  bool keep_dims_;   // must be true (asserted in OnInit)
  bool argmin_;      // true -> compute ArgMin instead of ArgMax
  const T *input_;
  const int32_t *input_dims_;
  uint32_t input_dim_size_;
  int32_t *output_;  // indices are always emitted as int32
  const int32_t *output_dims_;
  uint32_t output_dim_size_;
  MACE_OP_INPUT_TAGS(INPUT, AXIS);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_ARGMAX_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/bias_add.h"
#include "micro/base/logging.h"
#include "micro/ops/utils/crumb_utils.h"
namespace micro {
namespace ops {
// Caches tensor pointers/shapes, validates that the bias is a 1-D tensor
// whose length matches the input's channel (last) dimension, and sizes
// the output to the input shape. Only the NHWC data layout is supported.
MaceStatus BiasAddOp::OnInit() {
  const DataFormat data_format = static_cast<DataFormat>(
      GetArgByName("data_format", static_cast<int32_t>(NHWC)));
  MACE_ASSERT1(data_format != NCHW, "Now only support NHWC");
  input_ = GetInputData<mifloat>(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  bias_ = GetInputData<mifloat>(BIAS);
  bias_dim_size_ = GetInputShapeDimSize(BIAS);
  bias_dims_ = GetInputShapeDims(BIAS);
  MACE_ASSERT1(bias_dim_size_ == 1, "Bias dim must be 1.");
  MACE_ASSERT1(bias_dims_[0] == input_dims_[input_dim_size_ - 1],
               "The bias's channel dim should be equal to the input's");
  output_ = GetOutputData<mifloat>(OUTPUT);
  return ResizeOutputShape(OUTPUT, input_dim_size_, input_dims_);
}
// Adds the bias vector per channel to the input via the shared helper.
MaceStatus BiasAddOp::Run() {
  const int32_t bias_len = bias_dims_[0];
  return crumb::ComputeBias(input_, input_dims_, input_dim_size_,
                            bias_, bias_len, output_);
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_BIAS_ADD_H_
#define MICRO_OPS_BIAS_ADD_H_
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Adds a 1-D bias vector along the channel (last) dimension of an NHWC
// input tensor; the arithmetic is delegated to crumb::ComputeBias in the
// .cc file.
class BiasAddOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();
 private:
  const mifloat *input_;       // input tensor data
  const int32_t *input_dims_;  // input shape
  uint32_t input_dim_size_;
  const mifloat *bias_;        // 1-D bias, length == input channel dim
  const int32_t *bias_dims_;
  uint32_t bias_dim_size_;     // asserted to be 1 in OnInit
  mifloat *output_;
  MACE_OP_INPUT_TAGS(INPUT, BIAS);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_BIAS_ADD_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_CAST_H_
#define MICRO_OPS_CAST_H_
#include "micro/base/utils.h"
#include "micro/base/types.h"
#include "micro/framework/operator.h"
#include "micro/include/utils/bfloat16.h"
namespace micro {
namespace ops {
// Element-wise convert helper used by CastOp::Run: declares typed views
// over the untyped input_/output_ members and copies tensor_size_
// elements, relying on implicit SrcType -> DstType conversion. Expects
// input_, output_ and tensor_size_ to be in scope at the expansion site.
#ifndef MACE_CAST_OP_CAST_TENSOR
#define MACE_CAST_OP_CAST_TENSOR(SrcType, DstType)              \
  const SrcType *input = static_cast<const SrcType *>(input_);  \
  DstType *output = static_cast<DstType *>(output_);            \
  for (int32_t i = 0; i < tensor_size_; ++i) {                  \
    output[i] = input[i];                                       \
  }
#endif  // MACE_CAST_OP_CAST_TENSOR
// Casts the input tensor to the output tensor's data type, element by
// element. Only float <-> bfloat16 conversions are implemented, and only
// when MACE_ENABLE_BFLOAT16 is compiled in; every other combination
// reaches MACE_NOT_IMPLEMENTED.
class CastOp : public framework::Operator {
 public:
  // Reads the source data type (the "T" argument) and caches the untyped
  // input/output pointers plus the flattened element count.
  MaceStatus OnInit() {
    input_ = GetInputData<void>(INPUT);
    input_dt_ = static_cast<DataType>(
        GetArgByName("T", static_cast<int32_t>(DT_FLOAT)));
    const int32_t *dims = GetInputShapeDims(INPUT);
    const uint32_t dim_size = GetInputShapeDimSize(INPUT);
    tensor_size_ = base::GetShapeSize(dim_size, dims);
    MACE_ASSERT(tensor_size_ > 0);
    output_ = GetOutputData<void>(OUTPUT);
    output_dt_ = GetOutputDataType(OUTPUT);
    return MACE_SUCCESS;
  }
  // Performs the element-wise conversion for the supported type pairs.
  MaceStatus Run() {
#ifdef MACE_ENABLE_BFLOAT16
    if (input_dt_ == DT_FLOAT && output_dt_ == DT_BFLOAT16) {
      MACE_CAST_OP_CAST_TENSOR(float, BFloat16)
    } else if (input_dt_ == DT_BFLOAT16 && output_dt_ == DT_FLOAT) {
      MACE_CAST_OP_CAST_TENSOR(BFloat16, float)
    } else {
      MACE_NOT_IMPLEMENTED;
    }
#else
    MACE_NOT_IMPLEMENTED;
#endif
    return MACE_SUCCESS;
  }
 private:
  const void *input_;    // untyped source buffer
  DataType input_dt_;    // source element type (from the "T" argument)
  int32_t tensor_size_;  // flattened element count
  void *output_;         // untyped destination buffer
  DataType output_dt_;   // destination element type
  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_CAST_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/eltwise.h"
#include "micro/base/logging.h"
namespace micro {
namespace ops {
namespace eltwise {
// Returns true when the first dim_size entries of dims0 and dims1 are
// pairwise equal (two zero-length shapes are trivially equal).
// Fix: the previous `while (--dim_size > 0)` loop never compared index 0,
// so shapes differing only in the leading dimension were reported equal
// (sending broadcast cases down the same-shape fast path and reading the
// smaller tensor out of bounds); it also wrapped around for
// dim_size == 0 because dim_size is unsigned.
bool ShapeIsEqual(const int32_t *dims0,
                  const int32_t *dims1, uint32_t dim_size) {
  for (uint32_t i = 0; i < dim_size; ++i) {
    if (dims0[i] != dims1[i]) {
      return false;
    }
  }
  return true;
}
// Flattens a multi-dimensional index into a linear offset, treating
// dimensions of extent 1 as broadcast (they contribute no stride).
int32_t GetIndex(const int32_t *shape,
                 const int32_t *index, int32_t dim_size) {
  int32_t offset = 0;
  for (int32_t d = 0; d < dim_size; ++d) {
    const int32_t extent = shape[d];
    if (extent > 1) {
      offset = offset * extent + index[d];
    }
  }
  return offset;
}
// Advances a multi-dimensional index to the next row-major position,
// odometer-style: increments the last dimension and carries towards the
// first, wrapping each dimension that overflows its extent.
void IncreaseIndex(const int32_t *shape, int32_t **index, int32_t dim_size) {
  int32_t *idx = *index;
  int32_t d = dim_size;
  while (d-- > 0) {
    idx[d] += 1;
    if (idx[d] < shape[d]) {
      return;  // no carry needed
    }
    idx[d] -= shape[d];  // wrap and carry into the next-higher dimension
  }
}
} // namespace eltwise
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_ELTWISE_H_
#define MICRO_OPS_ELTWISE_H_
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/operator.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
namespace eltwise { // for redefine
// Supported element-wise operation kinds.
// NOTE(review): the numeric values appear to mirror the "type" op
// argument produced by the model converter (see EltwiseOp::OnInit) --
// confirm before renumbering.
enum Type {
  SUM = 0,
  SUB = 1,
  PROD = 2,
  DIV = 3,
  MIN = 4,
  MAX = 5,
  NEG = 6,
  ABS = 7,
  SQR_DIFF = 8,
  POW = 9,
  EQUAL = 10,
  FLOOR_DIV = 11,
  CLIP = 12,
  SIGN = 13,
  NONE = 14,
};
// Shape-equality check over dim_size entries; see eltwise.cc for the
// implementation.
bool ShapeIsEqual(const int32_t *dims0,
                  const int32_t *dims1, uint32_t dim_size);
// Flattens a multi-dim index to a linear offset; extent-1 dimensions are
// treated as broadcast (defined in eltwise.cc).
int32_t GetIndex(const int32_t *shape, const int32_t *index, int32_t dim_size);
// Odometer-style row-major increment of a multi-dim index (defined in
// eltwise.cc).
void IncreaseIndex(const int32_t *shape, int32_t **index, int32_t dim_size);
// Returns +1, -1 or 0 according to the sign of val.
template<typename T>
int32_t Sign(T val) {
  const T zero(0);
  if (zero < val) {
    return 1;
  }
  if (val < zero) {
    return -1;
  }
  return 0;
}
} // namespace eltwise
template<typename T>
class EltwiseOp : public framework::Operator {
public:
MaceStatus OnInit() {
input0_ = GetInputData<T>(INPUT0);
input0_dims_ = GetInputShapeDims(INPUT0);
input0_dim_size_ = GetInputShapeDimSize(INPUT0);
if (GetInputSize() >= 2) {
input1_ = GetInputData<T>(INPUT1);
input1_dims_ = GetInputShapeDims(INPUT1);
input1_dim_size_ = GetInputShapeDimSize(INPUT1);
} else {
input1_ = NULL;
input1_dims_ = NULL;
input1_dim_size_ = 0;
}
output_ = GetOutputData<T>(OUTPUT);
type_ = static_cast<eltwise::Type>(GetArgByName(
"type", static_cast<int32_t>(NONE)));
coeff_ = GetRepeatArgByName<float>("coeff", &coeff_size_);
scalar_input_ = GetArgByName("scalar_input", 1.0f);
scalar_input_index_ = GetArgByName("scalar_input_index",
static_cast<int32_t>(1));
DataFormat data_format = static_cast<DataFormat>(
GetArgByName("data_format", static_cast<int32_t>(NHWC)));
nchw_ = (data_format == NCHW);
return MACE_SUCCESS;
}
MaceStatus Run() {
MACE_ASSERT1(GetInputSize() < 3,
"Element-Wise does not support 3 or higher inputs,"
" you could change your model to multiple Element-Wise");
if (input1_ == NULL) {
input1_ = &scalar_input_;
input1_dim_size_ = 1;
input1_dims_ = static_cast<const int32_t *>(
reinterpret_cast<int32_t *>(&input1_dim_size_)); // a trick
}
if (type_ == eltwise::CLIP) {
MACE_ASSERT1(coeff_size_ == 2 && coeff_[0] < coeff_[1],
"Clip's min/max values are not correct.");
}
if (type_ == eltwise::EQUAL) { // IsLogicalType
// as we do not have bool-type tensor, we use int type
return DoEltwise<int32_t>();
} else {
return DoEltwise<T>();
}
}
private:
  // Core dispatcher: normalizes the two operands so that input0 is the
  // larger tensor (recording `swapped` so non-commutative ops keep their
  // original operand order), validates that the shapes are broadcastable,
  // resizes the output, and routes to the specialized kernel:
  // per-channel (NCHW), scalar, same-shape, general N-D broadcast, or
  // tail broadcast.
  template<typename DstType>
  MaceStatus DoEltwise() {
    int32_t input0_size = base::GetShapeSize(input0_dim_size_, input0_dims_);
    int32_t input1_size = input1_dim_size_ == 0 ?
                          0 : base::GetShapeSize(input1_dim_size_,
                                                 input1_dims_);
    // Keep the higher-rank / larger tensor in input0 so the kernels only
    // ever broadcast input1 into input0.
    bool swapped = false;
    if (input0_dim_size_ < input1_dim_size_
        || (input0_dim_size_ == input1_dim_size_
            && input0_size < input1_size)) {
      base::swap(&input0_, &input1_);
      base::swap(&input0_dims_, &input1_dims_);
      base::swap(&input0_dim_size_, &input1_dim_size_);
      base::swap(&input0_size, &input1_size);
      swapped = true;
    }
    // When the scalar argument is logically the FIRST operand, the
    // effective operand order is reversed once more.
    if (scalar_input_index_ == 0) {
      swapped = !swapped;
    }
    // check if we can broadcast tensor
    uint32_t rank_diff =
        static_cast<uint32_t>(input0_dim_size_ - input1_dim_size_);
    if (nchw_) {
      // NCHW only supports broadcasting along the channel dimension (and
      // optionally the batch dimension).
      MACE_ASSERT1((input0_dim_size_ == 4) &&
          ((input1_dim_size_ == 0) ||
          (input1_dim_size_ == 4 && input1_dims_[1] == input0_dims_[1] &&
              (input1_dims_[0] == input0_dims_[0] ||
                  input1_dims_[0] == 1)) ||
          (input1_dim_size_ == 1 && input1_dims_[0] == input0_dims_[1])),
                   "only support broadcast channel dimension");
    } else {
      // NHWC: input1's dims must align with input0's trailing dims, with
      // extent-1 broadcast allowed on either side.
      for (uint32_t i = 0; i < input1_dim_size_; ++i) {
        MACE_ASSERT1(input0_dims_[rank_diff + i] == 1 || input1_dims_[i] == 1 ||
            input0_dims_[rank_diff + i] == input1_dims_[i],
                     "Element-Wise op only support tail dimensions broadcast");
      }
    }
    if (nchw_ && input1_dim_size_ > 0) {
      MACE_RETURN_IF_ERROR(
          ResizeOutputShape(OUTPUT, input0_dim_size_, input0_dims_));
      DstType *output_ptr = reinterpret_cast<DstType *>(output_);
      if (input1_size < input0_size) {
        // input1 is a per-channel vector (optionally batched).
        TensorEltwisePerChannel(type_,
                                input0_,
                                input1_,
                                input0_dims_[0],
                                input1_dim_size_ == 1 ? 1 : input1_dims_[0],
                                input0_dims_[1],
                                input0_dims_[2] * input0_dims_[3],
                                swapped,
                                output_ptr);
      } else {
        TensorEltwise(type_, input0_, input1_, input0_size,
                      swapped, output_ptr);
      }
    } else {
      ScratchBuffer scratch_buffer(engine_config_);
      // Left-pad input1's shape with 1s to input0's rank.
      int32_t *input1_shape =
          scratch_buffer.GetBuffer<int32_t>(input0_dim_size_);
      if (rank_diff > 0) {
        base::memset(input1_shape, static_cast<int32_t>(1), rank_diff);
      }
      if (input1_dim_size_ > 0) {
        base::memcpy(input1_shape + rank_diff, input1_dims_,
                     input1_dim_size_ * sizeof(int32_t));
      }
      // Output shape is the element-wise max of the two padded shapes.
      int32_t *output_shape =
          scratch_buffer.GetBuffer<int32_t>(input0_dim_size_);
      for (uint32_t i = 0; i < input0_dim_size_; ++i) {
        output_shape[i] = base::max(input0_dims_[i], input1_shape[i]);
      }
      MACE_RETURN_IF_ERROR(
          ResizeOutputShape(OUTPUT, input0_dim_size_, output_shape));
      DstType *output_ptr = reinterpret_cast<DstType *>(output_);
      // A mid-shape extent-1 mismatch forces the general (index-walking)
      // broadcast kernel instead of the cheap tail broadcast.
      bool need_general_broadcast = false;
      for (uint32_t i = 0; i < input1_dim_size_; ++i) {
        if ((input0_dims_[rank_diff + i] == 1 && input1_dims_[i] > 1) ||
            (input0_dims_[rank_diff + i] > 1 && input1_dims_[i] == 1)) {
          need_general_broadcast = true;
          break;
        }
      }
      if (input1_size == 1) {
        TensorScalarEltwise(type_, input0_, input1_[0],
                            input0_size, swapped, output_ptr);
      } else if (eltwise::ShapeIsEqual(input0_dims_,
                                       input1_shape,
                                       input0_dim_size_)) {
        TensorEltwise(type_, input0_, input1_, input0_size,
                      swapped, output_ptr);
      } else if (need_general_broadcast) {
        int32_t *out_index =
            scratch_buffer.GetBuffer<int32_t>(input0_dim_size_);
        TensorGeneralBroadcastEltwise(type_, input0_, input1_, input0_dim_size_,
                                      swapped, input0_dims_, input1_shape,
                                      output_shape, out_index, output_ptr);
      } else {
        // input1 equals input0's trailing dims: repeat it for each
        // leading slice.
        int32_t common_size = input1_size;
        int32_t diff_size = input0_size / common_size;
        TensorBroadcastEltwise(type_, input0_, input1_,
                               diff_size, common_size, swapped, output_ptr);
      }
    }
    return MACE_SUCCESS;
  }
  // General N-D broadcast kernel: walks the output in row-major order,
  // maintaining a multi-dimensional cursor (out_index) that is mapped
  // into each operand with eltwise::GetIndex (extent-1 dimensions
  // contribute no stride, i.e. broadcast). `swapped` reverses the operand
  // order of the non-commutative ops (SUB/DIV/FLOOR_DIV/POW and SUM's
  // coefficients).
  template<typename DstType>
  inline void TensorGeneralBroadcastEltwise(
      const eltwise::Type type,
      const T *input0,
      const T *input1,
      const uint32_t dim_size,
      const bool swapped,
      const int32_t *input0_shape,
      const int32_t *input1_shape,
      const int32_t *output_shape,
      int32_t *out_index,
      DstType *output) {
    const int32_t output_size = base::GetShapeSize(dim_size, output_shape);
    // Start the cursor at the origin.
    base::memset(out_index, static_cast<int32_t>(0), dim_size);
    switch (type) {
      case eltwise::SUM:
        if (coeff_size_ == 0) {
          for (int32_t i = 0; i < output_size; ++i) {
            const int32_t idx0 =
                eltwise::GetIndex(input0_shape, out_index, dim_size);
            const int32_t idx1 =
                eltwise::GetIndex(input1_shape, out_index, dim_size);
            output[i] = input0[idx0] + input1[idx1];
            eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
          }
        } else {
          // Weighted sum: coefficients follow the original (pre-swap)
          // operand order.
          float coeff_copy[2] = {coeff_[0], coeff_[1]};
          if (swapped) {
            base::swap(coeff_copy, coeff_copy + 1);
          }
          for (int32_t i = 0; i < output_size; ++i) {
            const int32_t idx0 =
                eltwise::GetIndex(input0_shape, out_index, dim_size);
            const int32_t idx1 =
                eltwise::GetIndex(input1_shape, out_index, dim_size);
            output[i] =
                input0[idx0] * coeff_copy[0] + input1[idx1] * coeff_copy[1];
            eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
          }
        }
        break;
      case eltwise::SUB:
        if (!swapped) {
          for (int32_t i = 0; i < output_size; ++i) {
            const int32_t idx0 =
                eltwise::GetIndex(input0_shape, out_index, dim_size);
            const int32_t idx1 =
                eltwise::GetIndex(input1_shape, out_index, dim_size);
            output[i] = input0[idx0] - input1[idx1];
            eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
          }
        } else {
          for (int32_t i = 0; i < output_size; ++i) {
            const int32_t idx0 =
                eltwise::GetIndex(input0_shape, out_index, dim_size);
            const int32_t idx1 =
                eltwise::GetIndex(input1_shape, out_index, dim_size);
            output[i] = input1[idx1] - input0[idx0];
            eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
          }
        }
        break;
      case eltwise::PROD:
        for (int32_t i = 0; i < output_size; ++i) {
          const int32_t idx0 =
              eltwise::GetIndex(input0_shape, out_index, dim_size);
          const int32_t idx1 =
              eltwise::GetIndex(input1_shape, out_index, dim_size);
          output[i] = input0[idx0] * input1[idx1];
          eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
        }
        break;
      case eltwise::DIV:
        if (!swapped) {
          for (int32_t i = 0; i < output_size; ++i) {
            const int32_t idx0 =
                eltwise::GetIndex(input0_shape, out_index, dim_size);
            const int32_t idx1 =
                eltwise::GetIndex(input1_shape, out_index, dim_size);
            output[i] = input0[idx0] / input1[idx1];
            eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
          }
        } else {
          for (int32_t i = 0; i < output_size; ++i) {
            const int32_t idx0 =
                eltwise::GetIndex(input0_shape, out_index, dim_size);
            const int32_t idx1 =
                eltwise::GetIndex(input1_shape, out_index, dim_size);
            output[i] = input1[idx1] / input0[idx0];
            eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
          }
        }
        break;
      case eltwise::FLOOR_DIV:
        if (!swapped) {
          for (int32_t i = 0; i < output_size; ++i) {
            const int32_t idx0 =
                eltwise::GetIndex(input0_shape, out_index, dim_size);
            const int32_t idx1 =
                eltwise::GetIndex(input1_shape, out_index, dim_size);
            output[i] = base::floor(input0[idx0] / input1[idx1]);
            eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
          }
        } else {
          for (int32_t i = 0; i < output_size; ++i) {
            const int32_t idx0 =
                eltwise::GetIndex(input0_shape, out_index, dim_size);
            const int32_t idx1 =
                eltwise::GetIndex(input1_shape, out_index, dim_size);
            output[i] = base::floor(input1[idx1] / input0[idx0]);
            eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
          }
        }
        break;
      case eltwise::MIN:
        for (int32_t i = 0; i < output_size; ++i) {
          const int32_t idx0 =
              eltwise::GetIndex(input0_shape, out_index, dim_size);
          const int32_t idx1 =
              eltwise::GetIndex(input1_shape, out_index, dim_size);
          output[i] = base::min(input1[idx1], input0[idx0]);
          eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
        }
        break;
      case eltwise::MAX:
        for (int32_t i = 0; i < output_size; ++i) {
          const int32_t idx0 =
              eltwise::GetIndex(input0_shape, out_index, dim_size);
          const int32_t idx1 =
              eltwise::GetIndex(input1_shape, out_index, dim_size);
          output[i] = base::max(input1[idx1], input0[idx0]);
          eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
        }
        break;
      case eltwise::SQR_DIFF:
        for (int32_t i = 0; i < output_size; ++i) {
          const int32_t idx0 =
              eltwise::GetIndex(input0_shape, out_index, dim_size);
          const int32_t idx1 =
              eltwise::GetIndex(input1_shape, out_index, dim_size);
          output[i] = base::pow(input1[idx1] - input0[idx0], 2.f);
          eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
        }
        break;
      case eltwise::POW:
        if (!swapped) {
          for (int32_t i = 0; i < output_size; ++i) {
            const int32_t idx0 =
                eltwise::GetIndex(input0_shape, out_index, dim_size);
            const int32_t idx1 =
                eltwise::GetIndex(input1_shape, out_index, dim_size);
            output[i] = base::pow(input0[idx0], input1[idx1]);
            eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
          }
        } else {
          for (int32_t i = 0; i < output_size; ++i) {
            const int32_t idx0 =
                eltwise::GetIndex(input0_shape, out_index, dim_size);
            const int32_t idx1 =
                eltwise::GetIndex(input1_shape, out_index, dim_size);
            output[i] = base::pow(input1[idx1], input0[idx0]);
            eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
          }
        }
        break;
      case eltwise::EQUAL:
        for (int32_t i = 0; i < output_size; ++i) {
          const int32_t idx0 =
              eltwise::GetIndex(input0_shape, out_index, dim_size);
          const int32_t idx1 =
              eltwise::GetIndex(input1_shape, out_index, dim_size);
          output[i] = input1[idx1] == input0[idx0];
          eltwise::IncreaseIndex(output_shape, &out_index, dim_size);
        }
        break;
      default:LOG(FATAL) << "Eltwise op not support type "
                         << static_cast<int32_t>(type);
    }
  }
  // Tail-broadcast kernel: input1 holds `common_size` elements that are
  // reused for each of the `diff_size` leading slices of input0 (i.e.
  // input1's shape equals input0's trailing dimensions). `swapped`
  // reverses the operand order of the non-commutative ops; the unary ops
  // (NEG/ABS/SIGN/CLIP) always read input0.
  template<typename DstType>
  inline void TensorBroadcastEltwise(const eltwise::Type type,
                                     const T *input0,
                                     const T *input1,
                                     const int32_t diff_size,
                                     const int32_t common_size,
                                     const bool swapped,
                                     DstType *output) {
    switch (type) {
      case eltwise::SUM:
        if (coeff_size_ == 0) {
          for (int32_t d = 0; d < diff_size; ++d) {
            for (int32_t i = 0; i < common_size; ++i) {
              output[i + d * common_size] =
                  input0[i + d * common_size] + input1[i];
            }
          }
        } else {
          // Weighted sum: coefficients follow the original (pre-swap)
          // operand order.
          float coeff_copy[2] = {coeff_[0], coeff_[1]};
          if (swapped) {
            base::swap(coeff_copy, coeff_copy + 1);
          }
          for (int32_t d = 0; d < diff_size; ++d) {
            for (int32_t i = 0; i < common_size; ++i) {
              output[i + d * common_size] =
                  input0[i + d * common_size] * coeff_copy[0] +
                      input1[i] * coeff_copy[1];
            }
          }
        }
        break;
      case eltwise::SUB:
        if (!swapped) {
          for (int32_t d = 0; d < diff_size; ++d) {
            for (int32_t i = 0; i < common_size; ++i) {
              output[i + d * common_size] =
                  input0[i + d * common_size] - input1[i];
            }
          }
        } else {
          for (int32_t d = 0; d < diff_size; ++d) {
            for (int32_t i = 0; i < common_size; ++i) {
              output[i + d * common_size] =
                  input1[i] - input0[i + d * common_size];
            }
          }
        }
        break;
      case eltwise::PROD:
        for (int32_t d = 0; d < diff_size; ++d) {
          for (int32_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input0[i + d * common_size] * input1[i];
          }
        }
        break;
      case eltwise::DIV:
        if (!swapped) {
          for (int32_t d = 0; d < diff_size; ++d) {
            for (int32_t i = 0; i < common_size; ++i) {
              output[i + d * common_size] =
                  input0[i + d * common_size] / input1[i];
            }
          }
        } else {
          for (int32_t d = 0; d < diff_size; ++d) {
            for (int32_t i = 0; i < common_size; ++i) {
              output[i + d * common_size] =
                  input1[i] / input0[i + d * common_size];
            }
          }
        }
        break;
      case eltwise::FLOOR_DIV:
        if (!swapped) {
          for (int32_t d = 0; d < diff_size; ++d) {
            for (int32_t i = 0; i < common_size; ++i) {
              output[i + d * common_size] =
                  base::floor(input0[i + d * common_size] / input1[i]);
            }
          }
        } else {
          for (int32_t d = 0; d < diff_size; ++d) {
            for (int32_t i = 0; i < common_size; ++i) {
              output[i + d * common_size] =
                  base::floor(input1[i] / input0[i + d * common_size]);
            }
          }
        }
        break;
      case eltwise::MIN:
        for (int32_t d = 0; d < diff_size; ++d) {
          for (int32_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                base::min(input0[i + d * common_size], input1[i]);
          }
        }
        break;
      case eltwise::MAX:
        for (int32_t d = 0; d < diff_size; ++d) {
          for (int32_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                base::max(input0[i + d * common_size], input1[i]);
          }
        }
        break;
      case eltwise::SQR_DIFF:
        for (int32_t d = 0; d < diff_size; ++d) {
          for (int32_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                base::pow(input0[i + d * common_size] - input1[i], 2.f);
          }
        }
        break;
      case eltwise::POW:
        if (!swapped) {
          for (int32_t d = 0; d < diff_size; ++d) {
            for (int32_t i = 0; i < common_size; ++i) {
              output[i + d * common_size] =
                  base::pow(input0[i + d * common_size], input1[i]);
            }
          }
        } else {
          for (int32_t d = 0; d < diff_size; ++d) {
            for (int32_t i = 0; i < common_size; ++i) {
              output[i + d * common_size] =
                  base::pow(input1[i], input0[i + d * common_size]);
            }
          }
        }
        break;
      case eltwise::NEG:
        // Unary from here on: input1 is ignored.
        for (int32_t d = 0; d < diff_size; ++d) {
          for (int32_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] = -input0[i + d * common_size];
          }
        }
        break;
      case eltwise::ABS:
        for (int32_t d = 0; d < diff_size; ++d) {
          for (int32_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                base::fabs(input0[i + d * common_size]);
          }
        }
        break;
      case eltwise::EQUAL:
        for (int32_t d = 0; d < diff_size; ++d) {
          for (int32_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input0[i + d * common_size] == input1[i];
          }
        }
        break;
      case eltwise::CLIP:
        // coeff_[0]/coeff_[1] are the clip min/max (validated in Run()).
        for (int32_t d = 0; d < diff_size; ++d) {
          for (int32_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                base::max<float>(coeff_[0],
                                 base::min<float>(coeff_[1],
                                                  input0[i + d * common_size]));
          }
        }
        break;
      case eltwise::SIGN:
        for (int32_t d = 0; d < diff_size; ++d) {
          for (int32_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                eltwise::Sign(input0[i + d * common_size]);
          }
        }
        break;
      default:LOG(FATAL) << "Eltwise op not support type "
                         << static_cast<int32_t>(type);
    }
  }
  // Multiplication is costly, so we specialize the following case.
  // Same-shape kernel: both operands hold exactly `size` elements and are
  // combined index by index. `swapped` reverses the operand order of the
  // non-commutative ops; the unary ops (NEG/ABS/SIGN/CLIP) always read
  // input0.
  template<typename DstType>
  inline void TensorEltwise(const eltwise::Type type,
                            const T *input0,
                            const T *input1,
                            const int32_t size,
                            const bool swapped,
                            DstType *output) {
    switch (type) {
      case eltwise::SUM:
        if (coeff_size_ == 0) {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input0[i] + input1[i];
          }
        } else {
          // Weighted sum: coefficients follow the original (pre-swap)
          // operand order.
          float coeff_copy[2] = {coeff_[0], coeff_[1]};
          if (swapped) {
            base::swap(coeff_copy, coeff_copy + 1);
          }
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1];
          }
        }
        break;
      case eltwise::SUB:
        if (!swapped) {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input0[i] - input1[i];
          }
        } else {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input1[i] - input0[i];
          }
        }
        break;
      case eltwise::PROD:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = input0[i] * input1[i];
        }
        break;
      case eltwise::DIV:
        if (!swapped) {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input0[i] / input1[i];
          }
        } else {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input1[i] / input0[i];
          }
        }
        break;
      case eltwise::FLOOR_DIV:
        if (!swapped) {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = base::floor(input0[i] / input1[i]);
          }
        } else {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = base::floor(input1[i] / input0[i]);
          }
        }
        break;
      case eltwise::MIN:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = base::min(input0[i], input1[i]);
        }
        break;
      case eltwise::MAX:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = base::max(input0[i], input1[i]);
        }
        break;
      case eltwise::SQR_DIFF:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = base::pow(input0[i] - input1[i], 2.f);
        }
        break;
      case eltwise::POW:
        if (!swapped) {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = base::pow(input0[i], input1[i]);
          }
        } else {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = base::pow(input1[i], input0[i]);
          }
        }
        break;
      case eltwise::NEG:
        // Unary from here on: input1 is ignored.
        for (int32_t i = 0; i < size; ++i) {
          output[i] = -input0[i];
        }
        break;
      case eltwise::ABS:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = base::fabs(input0[i]);
        }
        break;
      case eltwise::EQUAL:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = (input0[i] == input1[i]);
        }
        break;
      case eltwise::CLIP:
        // coeff_[0]/coeff_[1] are the clip min/max (validated in Run()).
        for (int32_t i = 0; i < size; ++i) {
          output[i] = base::max<float>(
              coeff_[0], base::min<float>(coeff_[1], input0[i]));
        }
        break;
      case eltwise::SIGN:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = eltwise::Sign(input0[i]);
        }
        break;
      default:LOG(FATAL) << "Eltwise op not support type "
                         << static_cast<int32_t>(type);
    }
  }
  // Multiplication is costly, so we specialize the following case.
  // Scalar kernel: combines every element of input0 with the single
  // scalar value input1. `swapped` reverses the operand order of the
  // non-commutative ops; the unary ops (NEG/ABS/SIGN/CLIP) always read
  // input0.
  template<typename DstType>
  inline void TensorScalarEltwise(const eltwise::Type type,
                                  const T *input0,
                                  const T input1,
                                  const int32_t size,
                                  const bool swapped,
                                  DstType *output) {
    switch (type) {
      case eltwise::SUM:
        if (coeff_size_ == 0) {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input0[i] + input1;
          }
        } else {
          // Weighted sum: coefficients follow the original (pre-swap)
          // operand order.
          float coeff_copy[2] = {coeff_[0], coeff_[1]};
          if (swapped) {
            base::swap(coeff_copy, coeff_copy + 1);
          }
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1];
          }
        }
        break;
      case eltwise::SUB:
        if (!swapped) {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input0[i] - input1;
          }
        } else {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input1 - input0[i];
          }
        }
        break;
      case eltwise::PROD:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = input0[i] * input1;
        }
        break;
      case eltwise::DIV:
        if (!swapped) {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input0[i] / input1;
          }
        } else {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = input1 / input0[i];
          }
        }
        break;
      case eltwise::FLOOR_DIV:
        if (!swapped) {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = base::floor(input0[i] / input1);
          }
        } else {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = base::floor(input1 / input0[i]);
          }
        }
        break;
      case eltwise::MIN:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = base::min(input0[i], input1);
        }
        break;
      case eltwise::MAX:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = base::max(input0[i], input1);
        }
        break;
      case eltwise::SQR_DIFF:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = base::pow(input0[i] - input1, 2.f);
        }
        break;
      case eltwise::POW:
        if (!swapped) {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = base::pow(input0[i], input1);
          }
        } else {
          for (int32_t i = 0; i < size; ++i) {
            output[i] = base::pow(input1, input0[i]);
          }
        }
        break;
      case eltwise::NEG:
        // Unary from here on: input1 is ignored.
        for (int32_t i = 0; i < size; ++i) {
          output[i] = -input0[i];
        }
        break;
      case eltwise::ABS:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = base::fabs(input0[i]);
        }
        break;
      case eltwise::EQUAL:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = (input0[i] == input1);
        }
        break;
      case eltwise::CLIP:
        // coeff_[0]/coeff_[1] are the clip min/max (validated in Run()).
        for (int32_t i = 0; i < size; ++i) {
          output[i] = base::max<float>(coeff_[0],
                                       base::min<float>(coeff_[1], input0[i]));
        }
        break;
      case eltwise::SIGN:
        for (int32_t i = 0; i < size; ++i) {
          output[i] = eltwise::Sign(input0[i]);
        }
        break;
      default:LOG(FATAL) << "Eltwise op not support type "
                         << static_cast<int32_t>(type);
    }
  }
template<typename DstType>
inline void TensorEltwisePerChannel(const eltwise::Type type,
const T *input0,
const T *input1,
const int32_t batch0,
const int32_t batch1,
const int32_t channel,
const int32_t image_size,
const bool swapped,
DstType *output) {
switch (type) {
case eltwise::SUM:
if (coeff_size_ == 0) {
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] + in1_ptr[c];
}
}
}
} else {
float coeff_copy[2] = {coeff_[0], coeff_[1]};
if (swapped) {
base::swap(coeff_copy, coeff_copy + 1); // NOLINT
}
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] =
in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1];
}
}
}
}
break;
case eltwise::SUB:
if (!swapped) {
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] - in1_ptr[c];
}
}
}
} else {
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = in1_ptr[c] - in0_ptr[i];
}
}
}
}
break;
case eltwise::PROD:
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] * in1_ptr[c];
}
}
}
break;
case eltwise::DIV:
if (!swapped) {
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] / in1_ptr[c];
}
}
}
} else {
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = in1_ptr[c] / in0_ptr[i];
}
}
}
}
break;
case eltwise::FLOOR_DIV:
if (!swapped) {
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = base::floor(in0_ptr[i] / in1_ptr[c]);
}
}
}
} else {
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = base::floor(in1_ptr[c] / in0_ptr[i]);
}
}
}
}
break;
case eltwise::MIN:
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = base::min(in0_ptr[i], in1_ptr[c]);
}
}
}
break;
case eltwise::MAX:
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = base::max(in0_ptr[i], in1_ptr[c]); // NOLINT
}
}
}
break;
case eltwise::SQR_DIFF:
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = base::pow(in0_ptr[i] - in1_ptr[c], 2.f);
}
}
}
break;
case eltwise::POW:
if (!swapped) {
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = base::pow(in0_ptr[i], in1_ptr[c]);
}
}
}
} else {
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = base::pow(in1_ptr[c], in0_ptr[i]);
}
}
}
}
break;
case eltwise::NEG:
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = -input0[i];
}
}
}
break;
case eltwise::ABS:
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
for (int32_t i = 0; i < image_size; ++i) {
output[i] = base::fabs(input0[i]);
}
}
}
break;
case eltwise::EQUAL:
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
DstType *out_ptr = output + ((b * channel) + c) * image_size;
for (int32_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] == in1_ptr[c];
}
}
}
break;
case eltwise::SIGN:
for (int32_t b = 0; b < batch0; ++b) {
for (int32_t c = 0; c < channel; ++c) {
for (int32_t i = 0; i < image_size; ++i) {
output[i] = eltwise::Sign(input0[i]);
}
}
}
break;
default:LOG(FATAL) << "Eltwise op not support type "
<< static_cast<int32_t>(type);
}
}
 private:
  // First operand and its shape (dims pointer + rank).
  const T *input0_;
  const int32_t *input0_dims_;
  uint32_t input0_dim_size_;
  // Second operand and its shape; may be a scalar or per-channel tensor.
  const T *input1_;
  const int32_t *input1_dims_;
  uint32_t input1_dim_size_;
  // Destination buffer.
  T *output_;
  // Which element-wise operation to perform.
  eltwise::Type type_;
  // Optional coefficients (used by SUM); coeff_size_ == 0 means unset.
  const float *coeff_;
  uint32_t coeff_size_;
  // When one operand is a scalar: its value and which input it came from.
  T scalar_input_;
  int32_t scalar_input_index_;
  // True when data layout is NCHW rather than NHWC.
  bool nchw_;
  MACE_OP_INPUT_TAGS(INPUT0, INPUT1);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_ELTWISE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/expand_dims.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/model/argument.h"
namespace micro {
namespace ops {
// Caches input/output pointers and normalizes the "axis" argument.
// A negative axis counts from the end of the *output* shape, hence the
// "+ rank + 1" adjustment; the valid range after normalization is
// [0, input rank].
MaceStatus ExpandDimsOp::OnInit() {
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_ = GetInputData<mifloat>(INPUT);
  output_ = GetOutputData<mifloat>(OUTPUT);
  const int32_t raw_axis = GetArgByName("axis", static_cast<int32_t>(0));
  axis_ = raw_axis < 0
              ? raw_axis + static_cast<int32_t>(input_dim_size_) + 1
              : raw_axis;
  MACE_ASSERT2(axis_ >= 0 && axis_ <= static_cast<int32_t>(input_dim_size_),
               "axis is out of bound: ", axis_);
  return MACE_SUCCESS;
}
// Inserts a length-1 dimension at axis_ and copies the data through
// unchanged (expand_dims never reorders elements).
MaceStatus ExpandDimsOp::Run() {
  const int32_t output_dim_size = input_dim_size_ + 1;
  int32_t *output_dims =
      ScratchBuffer(engine_config_).GetBuffer<int32_t>(output_dim_size);
  // Dims before the axis are kept, the axis itself becomes 1, and the
  // remaining input dims shift right by one.
  for (int32_t i = 0; i < axis_; ++i) {
    output_dims[i] = input_dims_[i];
  }
  output_dims[axis_] = 1;
  for (int32_t i = axis_ + 1; i < output_dim_size; ++i) {
    output_dims[i] = input_dims_[i - 1];
  }
  // TODO(luxuhui): optimize this method by reusing buffer
  const int32_t input_data_size =
      base::GetShapeSize(input_dim_size_, input_dims_);
  base::memcpy(output_, input_, input_data_size * sizeof(mifloat));
  return ResizeOutputShape(OUTPUT, output_dim_size, output_dims);
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_EXPAND_DIMS_H_
#define MICRO_OPS_EXPAND_DIMS_H_
#include "micro/base/types.h"
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Inserts a length-1 dimension into the input's shape at a configurable
// axis (like TensorFlow's expand_dims); the element data is copied through
// unchanged.
class ExpandDimsOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();

 private:
  // Input tensor data and shape (dims pointer + rank).
  const mifloat *input_;
  const int32_t *input_dims_;
  uint32_t input_dim_size_;
  // Output buffer.
  mifloat *output_;
  // Normalized insertion axis in [0, input rank]; see OnInit.
  int32_t axis_;
  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_EXPAND_DIMS_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/matmul.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/model/argument.h"
namespace micro {
namespace ops {
// Caches operand pointers/shapes and the transpose flags.
// BIAS is the third input (index 2), so it is present whenever the op has
// at least 3 inputs; the previous check (GetInputSize() > 3) required four
// inputs and therefore never picked up the bias.
MaceStatus MatMulOp::OnInit() {
  transpose_a_ = GetArgByName("transpose_a", false);
  transpose_b_ = GetArgByName("transpose_b", false);
  input_a_ = GetInputData<mifloat>(INPUT_A);
  input_b_ = GetInputData<mifloat>(INPUT_B);
  bias_ = GetInputSize() >= 3 ? GetInputData<mifloat>(BIAS) : NULL;
#ifndef NDEBUG
  // bias_dims_/bias_dim_size_ only exist in debug builds (see header) and
  // are read by the assert in Run(); they were previously left
  // uninitialized.
  if (bias_ != NULL) {
    bias_dims_ = GetInputShapeDims(BIAS);
    bias_dim_size_ = GetInputShapeDimSize(BIAS);
  }
#endif
  output_ = GetOutputData<mifloat>(OUTPUT);
  input_a_dim_size_ = GetInputShapeDimSize(INPUT_A);
  input_b_dim_size_ = GetInputShapeDimSize(INPUT_B);
  input_a_dims_ = GetInputShapeDims(INPUT_A);
  input_b_dims_ = GetInputShapeDims(INPUT_B);
  MACE_ASSERT1(input_a_dim_size_ >= 2 && input_b_dim_size_ >= 2,
               "rank should be greater than or equal to 2");
  return MACE_SUCCESS;
}
// Computes output = op(A) x op(B) (+ bias), where op() optionally
// transposes the trailing two dims. Degenerate row/column shapes are
// routed to GEMV; everything else goes to GEMM.
MaceStatus MatMulOp::Run() {
  MACE_ASSERT(Validate());
  const int32_t lhs_rank = input_a_dim_size_;
  const int32_t lhs_rows = input_a_dims_[lhs_rank - 2];
  const int32_t lhs_cols = input_a_dims_[lhs_rank - 1];
  const int32_t rhs_rank = input_b_dim_size_;
  const int32_t rhs_rows = input_b_dims_[rhs_rank - 2];
  const int32_t rhs_cols = input_b_dims_[rhs_rank - 1];
  const int32_t rows = transpose_a_ ? lhs_cols : lhs_rows;
  const int32_t cols = transpose_b_ ? rhs_rows : rhs_cols;
  const int32_t depth = transpose_a_ ? lhs_rows : lhs_cols;
  const int32_t lhs_batch =
      base::accumulate_multi(input_a_dims_, 0, input_a_dim_size_ - 2);
  const int32_t rhs_batch =
      base::accumulate_multi(input_b_dims_, 0, input_b_dim_size_ - 2);
  // The output inherits the rank and batch dims of the higher-rank operand.
  // The previous code always sized/filled the scratch buffer from input A
  // and resized the output with A's rank, which overflowed the buffer and
  // produced the wrong shape when B had the higher rank. It also passed an
  // element count to base::memcpy, which takes a byte count.
  const uint32_t output_dim_size =
      lhs_rank >= rhs_rank ? input_a_dim_size_ : input_b_dim_size_;
  int32_t *output_dims =
      ScratchBuffer(engine_config_).GetBuffer<int32_t>(output_dim_size);
  int32_t batch = 1;
  if (lhs_rank >= rhs_rank) {
    base::memcpy(output_dims, input_a_dims_,
                 input_a_dim_size_ * sizeof(int32_t));
    output_dims[lhs_rank - 2] = rows;
    output_dims[lhs_rank - 1] = cols;
    batch = lhs_batch;
  } else {
    base::memcpy(output_dims, input_b_dims_,
                 input_b_dim_size_ * sizeof(int32_t));
    output_dims[rhs_rank - 2] = rows;
    output_dims[rhs_rank - 1] = cols;
    batch = rhs_batch;
  }
  // The lower-rank operand (if any) is broadcast across the batch.
  bool lhs_batched = true;
  bool rhs_batched = true;
  if (lhs_rank < rhs_rank) {
    lhs_batched = false;
  } else if (rhs_rank < lhs_rank) {
    rhs_batched = false;
  }
  MACE_RETURN_IF_ERROR(
      ResizeOutputShape(OUTPUT, output_dim_size, output_dims));
  if (rows == 1 && transpose_b_) {
    // A is a single row and B is transposed: compute as B * A^T via GEMV.
    return gemv_.Compute(input_b_,
                         input_a_,
                         bias_,
                         batch,
                         cols,
                         depth,
                         rhs_batched,
                         lhs_batched,
                         output_);
  } else if (cols == 1 && !transpose_a_) {
    // B is a single column: a plain matrix-vector product.
    return gemv_.Compute(input_a_,
                         input_b_,
                         bias_,
                         batch,
                         rows,
                         depth,
                         lhs_batched,
                         rhs_batched,
                         output_);
  } else {
    MaceStatus ret = gemm_.Compute(input_a_,
                                   input_b_,
                                   batch,
                                   lhs_rows,
                                   lhs_cols,
                                   rhs_rows,
                                   rhs_cols,
                                   transpose_a_,
                                   transpose_b_,
                                   false,
                                   lhs_batched,
                                   rhs_batched,
                                   output_);
    // GEMM has no fused bias, so add it per output column afterwards.
    if (bias_ != NULL) {
      MACE_ASSERT1(bias_dim_size_ == 1 && bias_dims_[0] == cols,
                   "bias' dim should be <= 2.");
      for (int32_t i = 0; i < batch * rows; ++i) {
        for (int32_t w = 0; w < cols; ++w) {
          int32_t idx = i * cols + w;
          output_[idx] = output_[idx] + bias_[w];
        }
      }
    }
    return ret;
  }
}
bool MatMulOp::Validate() {
const int32_t lhs_rank = input_a_dim_size_;
const int32_t rhs_rank = input_b_dim_size_;
if (input_a_dim_size_ == input_b_dim_size_) {
for (uint32_t i = 0; i < input_a_dim_size_ - 2; ++i) {
MACE_ASSERT1(input_a_dims_[i] == input_b_dims_[i],
"batch dimensions are not equal");
}
} else {
MACE_ASSERT1(input_a_dim_size_ == 2 || input_b_dim_size_ == 2,
"Either lhs or rhs matrix should has rank 2 "
"for non-batched matrix multiplication");
}
int32_t lhs_depth = transpose_a_ ? input_a_dims_[lhs_rank - 2] :
input_a_dims_[lhs_rank - 1];
int32_t rhs_depth = transpose_b_ ? input_b_dims_[rhs_rank - 1] :
input_b_dims_[rhs_rank - 2];
if (lhs_depth != rhs_depth) {
MACE_ASSERT1(false, "the number of A's column must be equal to B's row ");
return false;
}
return true;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_MATMUL_H_
#define MICRO_OPS_MATMUL_H_
#include "micro/framework/operator.h"
#include "micro/ops/utils/gemv.h"
#include "micro/ops/utils/gemm.h"
namespace micro {
namespace ops {
// Matrix multiplication with optional transpose flags and optional bias;
// dispatches to GEMV for vector-shaped operands and GEMM otherwise.
class MatMulOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();

 private:
  // Shape-compatibility check; see the .cc for the exact rules.
  bool Validate();

 private:
  // Operand A data and shape (dims pointer + rank).
  const mifloat *input_a_;
  const int32_t *input_a_dims_;
  uint32_t input_a_dim_size_;
  // Operand B data and shape.
  const mifloat *input_b_;
  const int32_t *input_b_dims_;
  uint32_t input_b_dim_size_;
  // Optional bias (NULL when the op has no third input).
  const mifloat *bias_;
#ifndef NDEBUG
  // Bias shape, kept only for debug-build asserts.
  const int32_t *bias_dims_;
  uint32_t bias_dim_size_;
#endif
  mifloat *output_;
  // Whether to transpose the trailing two dims of A / B.
  bool transpose_a_;
  bool transpose_b_;
  Gemv<mifloat> gemv_;
  Gemm<mifloat> gemm_;
  MACE_OP_INPUT_TAGS(INPUT_A, INPUT_B, BIAS);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_MATMUL_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/base/conv_2d_base.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/include/utils/macros.h"
#include "micro/model/operator_def.h"
#include "micro/ops/utils/crumb_utils.h"
namespace micro {
namespace ops {
// Caches input/filter/bias/output pointers and shapes, initializes the
// activation helper, then delegates stride/dilation/padding parsing to
// FilterOpBase::OnInitBase. Only NHWC data layout is accepted.
MaceStatus Conv2dBase::OnInit() {
  MACE_ASSERT1(static_cast<DataFormat>(
                   GetArgByName("data_format",
                                static_cast<int32_t>(NHWC)))
                   != NCHW, "Only support NHWC");
  input_ = GetInputData<mifloat>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  filter_ = GetInputData<mifloat>(FILTER);
  filter_dims_ = GetInputShapeDims(FILTER);
  filter_dim_size_ = GetInputShapeDimSize(FILTER);
  // BIAS is the third input; when absent, bias_ stays NULL and
  // bias_dims_/bias_dim_size_ are intentionally left unset (Run only reads
  // them when bias_ != NULL).
  if (GetInputSize() >= 3) {
    bias_ = GetInputData<mifloat>(BIAS);
    bias_dims_ = GetInputShapeDims(BIAS);
    bias_dim_size_ = GetInputShapeDimSize(BIAS);
  } else {
    bias_ = NULL;
  }
  output_ = GetOutputData<mifloat>(OUTPUT);
  MACE_RETURN_IF_ERROR(activation_.Init(this));
  return FilterOpBase::OnInitBase();
}
// Template method: computes the output shape, calls the subclass kernel
// (Compute), then applies bias and activation in place on the output.
MaceStatus Conv2dBase::Run() {
  int32_t output_dims[4] = {0};
  InitPaddingAndOutputSize(input_dims_, filter_dims_, FLOOR, output_dims);
  ResizeOutputShape(0, 4, output_dims);
  MACE_RETURN_IF_ERROR(Compute(output_dims));
  if (bias_ != NULL) {
    // Bias is broadcast per output channel (bias_dims_[0] entries).
    MACE_RETURN_IF_ERROR(crumb::ComputeBias(
        output_, output_dims, input_dim_size_, bias_, bias_dims_[0], output_));
  }
  // NOTE(review): GetShapeSize is called with input_dim_size_ against the
  // 4-entry output_dims — presumably input rank is always 4 for NHWC conv;
  // confirm.
  MACE_RETURN_IF_ERROR(activation_.Compute(
      output_, base::GetShapeSize(input_dim_size_, output_dims), output_));
  return MACE_SUCCESS;
}
// Base-class placeholder: concrete convolution kernels override this.
// Reaching it at runtime is an error.
MaceStatus Conv2dBase::Compute(int32_t (&output_dims)[4]) {
  MACE_UNUSED(output_dims);
  MACE_NOT_IMPLEMENTED;
  return MACE_RUNTIME_ERROR;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_BASE_CONV_2D_BASE_H_
#define MICRO_OPS_NHWC_BASE_CONV_2D_BASE_H_
#include "micro/ops/nhwc/base/filter_op_base.h"
#include "micro/ops/utils/activation.h"
namespace micro {
namespace ops {
// Common base for NHWC 2-D convolution ops: owns the input/filter/bias
// tensors and the activation helper; subclasses implement Compute() with
// the actual convolution kernel.
class Conv2dBase : public FilterOpBase {
 public:
  virtual MaceStatus OnInit();
  virtual MaceStatus Run();

 protected:
  // Subclass hook: performs the convolution into output_; output_dims is
  // the already-computed NHWC output shape.
  virtual MaceStatus Compute(int32_t (&output_dims)[4]);

 protected:
  // Input tensor data and shape (dims pointer + rank).
  const mifloat *input_;
  const int32_t *input_dims_;
  uint32_t input_dim_size_;
  // Filter tensor data and shape.
  const mifloat *filter_;
  const int32_t *filter_dims_;
  uint32_t filter_dim_size_;
  // Optional bias (NULL when the op has only two inputs).
  const mifloat *bias_;
  const int32_t *bias_dims_;
  uint32_t bias_dim_size_;
  mifloat *output_;
  // Fused activation applied after bias in Run().
  Activation activation_;
  MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_BASE_CONV_2D_BASE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/base/depthwise_conv_2d_base.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/model/operator_def.h"
#include "micro/ops/utils/crumb_utils.h"
namespace micro {
namespace ops {
// Same pipeline as Conv2dBase::Run, except the output channel count is
// the filter's multiplier times the input channels (depthwise semantics).
MaceStatus DepthwiseConv2dBase::Run() {
  int32_t output_dims[4] = {0};
  InitPaddingAndOutputSize(input_dims_, filter_dims_, FLOOR, output_dims);
  // The base computed output_dims[3] from filter_dims_[0] (the channel
  // multiplier); scale by input channels for the depthwise output.
  output_dims[3] *= input_dims_[3];
  ResizeOutputShape(0, 4, output_dims);
  MACE_RETURN_IF_ERROR(Compute(output_dims));
  if (bias_ != NULL) {
    MACE_RETURN_IF_ERROR(crumb::ComputeBias(
        output_, output_dims, input_dim_size_, bias_, bias_dims_[0], output_));
  }
  MACE_RETURN_IF_ERROR(activation_.Compute(
      output_, base::GetShapeSize(input_dim_size_, output_dims), output_));
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_BASE_DEPTHWISE_CONV_2D_BASE_H_
#define MICRO_OPS_NHWC_BASE_DEPTHWISE_CONV_2D_BASE_H_
#include "micro/ops/nhwc/base/conv_2d_base.h"
#include "micro/ops/utils/activation.h"
namespace micro {
namespace ops {
// Depthwise variant of Conv2dBase: reuses the base class' OnInit and only
// overrides Run to account for the channel multiplier in the output shape.
class DepthwiseConv2dBase : public Conv2dBase {
 public:
  MaceStatus Run();
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_BASE_DEPTHWISE_CONV_2D_BASE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/base/filter_op_base.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/model/argument.h"
namespace micro {
namespace ops {
// Parses the common filter-op arguments. Strides are mandatory; dilations
// default to 1x1; explicit "padding_values" override the symbolic
// "padding" policy (padding_type_ is set to NONE in that case).
MaceStatus FilterOpBase::OnInitBase() {
  strides_ = GetRepeatArgByName<int32_t>("strides");
  MACE_ASSERT(strides_ != NULL);
  const int32_t *dilations = GetRepeatArgByName<int32_t>("dilations");
  if (dilations != NULL) {
    base::memcpy(dilations_, dilations, 2 * sizeof(int32_t));
  } else {
    dilations_[0] = 1;
    dilations_[1] = 1;
  }
  const int32_t *padding_sizes = GetRepeatArgByName<int32_t>("padding_values");
  if (padding_sizes != NULL) {
    padding_type_ = NONE;
    base::memcpy(padding_sizes_, padding_sizes, 2 * sizeof(int32_t));
  } else {
    padding_type_ = static_cast<Padding>(GetArgByName(
        "padding", static_cast<int32_t>(SAME)));
  }
  return MACE_SUCCESS;
}
// Dispatches output-shape computation: explicit padding values
// (padding_type_ == NONE) use the given sizes directly; otherwise the
// padding is derived from the symbolic policy.
void FilterOpBase::InitPaddingAndOutputSize(const int32_t *input_dims,
                                            const int32_t *filter_dims,
                                            const RoundType round_type,
                                            int32_t *output_dims) {
  if (padding_type_ == NONE) {
    CalcOutputSizeWithPaddingSize(
        input_dims, filter_dims, round_type, output_dims);
  } else {
    CalcPaddingAndOutputSize(input_dims, filter_dims, output_dims);
  }
}
// Derives the NHWC output shape from the symbolic padding policy
// (VALID/SAME/FULL) and stores the resulting *total* padding (both sides
// combined) into padding_sizes_[0] (height) and padding_sizes_[1] (width).
// input_dims/filter_dims are NHWC; output channels come from
// filter_dims[0].
void FilterOpBase::CalcPaddingAndOutputSize(const int32_t *input_dims,
                                            const int32_t *filter_dims,
                                            int32_t *output_dims) {
  MACE_ASSERT1(dilations_[0] > 0 && dilations_[1] > 0,
               "Invalid dilations, must >= 1");
  MACE_ASSERT1((dilations_[0] == 1 || strides_[0] == 1) &&
                   (dilations_[1] == 1 || strides_[1] == 1),
               "If dilations > 1, strides should be 1");
  MACE_ASSERT(output_dims != NULL);
  int32_t input_height = input_dims[1];
  int32_t input_width = input_dims[2];
  int32_t kernel_height = filter_dims[1];
  int32_t kernel_width = filter_dims[2];
  /*
  * Convolution/pooling arithmetic:
  * o = (i + 2 * p - k - (k - 1) * (d - 1)) / s + 1
  * For details, see https://arxiv.org/pdf/1603.07285.pdf or
  * http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html
  */
  int32_t output_height = 0, output_width = 0;
  int32_t output_channels = filter_dims[0];
  // Effective kernel extents after dilation.
  int32_t k_extent_height = (kernel_height - 1) * dilations_[0] + 1;
  int32_t k_extent_width = (kernel_width - 1) * dilations_[1] + 1;
  switch (padding_type_) {
    case VALID: {
      output_height = (input_height - k_extent_height) / strides_[0] + 1;
      output_width = (input_width - k_extent_width) / strides_[1] + 1;
      break;
    }
    case SAME: {
      output_height = (input_height - 1) / strides_[0] + 1;
      output_width = (input_width - 1) / strides_[1] + 1;
      break;
    }
    case FULL: {
      output_height = (input_height + k_extent_height - 2) / strides_[0] + 1;
      output_width = (input_width + k_extent_width - 2) / strides_[1] + 1;
      break;
    }
    default: {
      MACE_ASSERT2(false, "Unsupported padding type: ",
                   static_cast<int32_t>(padding_type_));
      break;
    }
  }
  // Back-solve the total padding needed to realize the output size above.
  padding_sizes_[0] = base::max<int32_t>(
      0, (output_height - 1) * strides_[0] + k_extent_height - input_height);
  padding_sizes_[1] = base::max<int32_t>(
      0, (output_width - 1) * strides_[1] + k_extent_width - input_width);
  output_dims[0] = input_dims[0];
  output_dims[1] = output_height;
  output_dims[2] = output_width;
  output_dims[3] = output_channels;
}
// Computes the NHWC output shape from explicit padding sizes, with
// FLOOR or CEIL rounding of the division by stride.
// NOTE(review): padding_sizes_[i] is added twice here, i.e. it is treated
// as *per-side* padding (from the model's "padding_values"), whereas
// CalcPaddingAndOutputSize stores *total* padding into the same array —
// confirm the two conventions are intended.
void FilterOpBase::CalcOutputSizeWithPaddingSize(const int32_t *input_dims,
                                                 const int32_t *filter_dims,
                                                 const RoundType round_type,
                                                 int32_t *output_dims) {
  MACE_ASSERT1(dilations_[0] > 0 && dilations_[1] > 0,
               "Invalid dilations, must >= 1");
  MACE_ASSERT1((dilations_[0] == 1 || strides_[0] == 1) &&
                   (dilations_[1] == 1 || strides_[1] == 1),
               "If dilations > 1, strides should be 1");
  MACE_ASSERT(output_dims != NULL);
  int32_t input_height = input_dims[1];
  int32_t input_width = input_dims[2];
  int32_t kernel_height = filter_dims[1];
  int32_t kernel_width = filter_dims[2];
  int32_t output_channels = filter_dims[0];
  // Numerator of the conv arithmetic formula: i + 2p - (k-1)*d - 1.
  float output_h_f = input_height + padding_sizes_[0] + padding_sizes_[0]
      - (kernel_height - 1) * dilations_[0] - 1;
  float output_w_f = input_width + padding_sizes_[1] + padding_sizes_[1]
      - (kernel_width - 1) * dilations_[1] - 1;
  int32_t output_height = 1;
  int32_t output_width = 1;
  if (round_type == FLOOR) {
    output_height += static_cast<int32_t>(output_h_f / strides_[0]);
    output_width += static_cast<int32_t>(output_w_f / strides_[1]);
  } else {
    output_height += base::ceil(output_h_f / strides_[0]);
    output_width += base::ceil(output_w_f / strides_[1]);
  }
  output_dims[0] = input_dims[0];
  output_dims[1] = output_height;
  output_dims[2] = output_width;
  output_dims[3] = output_channels;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_BASE_FILTER_OP_BASE_H_
#define MICRO_OPS_NHWC_BASE_FILTER_OP_BASE_H_
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Symbolic padding policies; NONE means explicit padding values were
// supplied in the model and no policy applies.
enum Padding {
  VALID = 0,  // No padding
  SAME = 1,  // Pads with half the filter size (rounded down) on both sides
  FULL = 2,  // Pads with one less than the filter size on both sides
  NONE,
};

// Rounding mode for the output-size division by stride.
enum RoundType {
  FLOOR = 0,
  CEIL = 1,
};

// Shared base for ops with a sliding filter (conv, depthwise conv,
// pooling): parses strides/dilations/padding arguments and computes
// output shapes and padding sizes.
class FilterOpBase : public framework::Operator {
 public:
  // Parses "strides", "dilations", "padding"/"padding_values" arguments;
  // call from the subclass' OnInit.
  MaceStatus OnInitBase();

 protected:
  // Fills output_dims (NHWC) and padding_sizes_ from the configured
  // padding policy or explicit padding values.
  void InitPaddingAndOutputSize(const int32_t *input_dims,
                                const int32_t *filter_dims,
                                const RoundType round_type,
                                int32_t *output_dims);

 private:
  void CalcPaddingAndOutputSize(const int32_t *input_dims,
                                const int32_t *filter_dims,
                                int32_t *output_dims);
  void CalcOutputSizeWithPaddingSize(const int32_t *input_dims,
                                     const int32_t *filter_dims,
                                     const RoundType round_type,
                                     int32_t *output_dims);

 protected:
  // NONE when explicit padding values are in use.
  Padding padding_type_;
  // Two-element [h, w] stride array from the model arguments.
  const int32_t *strides_;
  // [h, w] padding sizes; see the .cc for which convention applies.
  int32_t padding_sizes_[2];
  // [h, w] dilations, defaulting to {1, 1}.
  int32_t dilations_[2];
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_BASE_FILTER_OP_BASE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/base/pooling_base.h"
#include "micro/base/logging.h"
#include "micro/include/utils/macros.h"
#include "micro/ops/nhwc/base/filter_op_base.h"
namespace micro {
namespace ops {
// Caches input/output pointers, reads pooling arguments (kernels,
// pooling_type, round_mode), and synthesizes a pseudo filter shape so the
// shared FilterOpBase padding/output-size machinery can be reused.
// Only NHWC data layout is accepted.
MaceStatus PoolingBase::OnInit() {
  MACE_ASSERT1(static_cast<DataFormat>(
                   GetArgByName("data_format",
                                static_cast<int32_t>(NHWC)))
                   != NCHW, "Only support NHWC");
  input_ = GetInputData<mifloat>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  output_ = GetOutputData<mifloat>(OUTPUT);
  output_dims_ = GetOutputShapeDims(OUTPUT);
  output_dim_size_ = GetOutputShapeDimSize(OUTPUT);
  kernel_ = GetRepeatArgByName<int32_t>("kernels");
  MACE_ASSERT(kernel_ != NULL);
  // Pooling defaults to AVG and FLOOR rounding when the args are absent.
  int32_t pooling_type =
      GetArgByName("pooling_type", static_cast<int32_t>(AVG));
  pooling_type_ = static_cast<PoolingType>(pooling_type);
  int32_t round_type = GetArgByName("round_mode", static_cast<int32_t>(FLOOR));
  round_type_ = static_cast<RoundType>(round_type);
  // Pooling keeps the channel count, so the pseudo filter's "output
  // channels" and "input channels" are both the input's channel dim.
  filter_dims_[0] = filter_dims_[3] = input_dims_[3];
  filter_dims_[1] = kernel_[0];
  filter_dims_[2] = kernel_[1];
  return FilterOpBase::OnInitBase();
}
// Computes the output shape, then dispatches to the subclass' Max/Avg
// pooling kernel.
MaceStatus PoolingBase::Run() {
  int32_t output_dims[4] = {0};
  InitPaddingAndOutputSize(input_dims_, filter_dims_, round_type_, output_dims);
  ResizeOutputShape(OUTPUT, 4, output_dims);
  // padding_sizes_ holds total padding; halve it to get the top/left pad.
  int32_t pad_hw[2] = {padding_sizes_[0] / 2, padding_sizes_[1] / 2};
  if (pooling_type_ == MAX) {
    MaxPooling(input_, kernel_, strides_, dilations_, pad_hw);
  } else if (pooling_type_ == AVG) {
    AvgPooling(input_, kernel_, strides_, dilations_, pad_hw);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
  return MACE_SUCCESS;
}
// Base-class placeholder: concrete pooling kernels override this.
// Reaching it at runtime is an error.
void PoolingBase::MaxPooling(const mifloat *input,
                             const int32_t *filter_hw,
                             const int32_t *stride_hw,
                             const int32_t *dilation_hw,
                             const int32_t *pad_hw) {
  MACE_UNUSED(pad_hw);
  MACE_UNUSED(dilation_hw);
  MACE_UNUSED(stride_hw);
  MACE_UNUSED(filter_hw);
  MACE_UNUSED(input);
  MACE_NOT_IMPLEMENTED;
}
// Base-class placeholder: concrete pooling kernels override this.
// Reaching it at runtime is an error.
void PoolingBase::AvgPooling(const mifloat *input,
                             const int32_t *filter_hw,
                             const int32_t *stride_hw,
                             const int32_t *dilation_hw,
                             const int32_t *pad_hw) {
  MACE_UNUSED(pad_hw);
  MACE_UNUSED(dilation_hw);
  MACE_UNUSED(stride_hw);
  MACE_UNUSED(filter_hw);
  MACE_UNUSED(input);
  MACE_NOT_IMPLEMENTED;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_BASE_POOLING_BASE_H_
#define MICRO_OPS_NHWC_BASE_POOLING_BASE_H_
#include "micro/model/output_shape.h"
#include "micro/ops/nhwc/base/filter_op_base.h"
namespace micro {
namespace ops {
// Pooling kernel selector; values match the model's "pooling_type" arg.
enum PoolingType {
  AVG = 1,  // avg_pool
  MAX = 2,  // max_pool
};

// Common base for NHWC pooling ops: parses pooling arguments and computes
// the output shape; subclasses supply the Max/Avg pooling kernels.
class PoolingBase : public FilterOpBase {
 public:
  MaceStatus OnInit();
  MaceStatus Run();

 protected:
  // Subclass hooks performing the actual pooling into output_; arguments
  // are two-element [h, w] arrays.
  virtual void MaxPooling(const mifloat *input, const int32_t *filter_hw,
                          const int32_t *stride_hw, const int32_t *dilation_hw,
                          const int32_t *pad_hw);
  virtual void AvgPooling(const mifloat *input, const int32_t *filter_hw,
                          const int32_t *stride_hw, const int32_t *dilation_hw,
                          const int32_t *pad_hw);

 protected:
  // Input tensor data and shape (dims pointer + rank).
  const mifloat *input_;
  const int32_t *input_dims_;
  uint32_t input_dim_size_;
  // Output buffer and shape.
  mifloat *output_;
  const int32_t *output_dims_;
  uint32_t output_dim_size_;
  // [h, w] pooling window from the "kernels" argument.
  const int32_t *kernel_;
  // Pseudo NHWC filter shape built in OnInit for FilterOpBase.
  int32_t filter_dims_[4];
  // Output-size rounding mode from "round_mode".
  RoundType round_type_;
  PoolingType pooling_type_;
  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_BASE_POOLING_BASE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/batch_norm.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
// Caches the INPUT/SCALE/OFFSET tensors and their shapes, validates ranks,
// reads the "epsilon" argument, initializes the fused activation, and sizes
// the output to match the input.
MaceStatus BatchNormOp::OnInit() {
  epsilon_ = GetArgByName("epsilon", static_cast<float>(1e-4));

  input_ = GetInputData<mifloat>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  MACE_ASSERT(input_dim_size_ >= 1);

  scale_ = GetInputData<mifloat>(SCALE);
  scale_dims_ = GetInputShapeDims(SCALE);
  scale_dim_size_ = GetInputShapeDimSize(SCALE);
  MACE_ASSERT1(scale_dim_size_ == 1, "scale must be 1-dimensional. ");

  offset_ = GetInputData<mifloat>(OFFSET);
  offset_dims_ = GetInputShapeDims(OFFSET);
  offset_dim_size_ = GetInputShapeDimSize(OFFSET);
  MACE_ASSERT1(offset_dim_size_ == 1, "offset must be 1-dimensional. ");

  output_ = GetOutputData<mifloat>(OUTPUT);

  MACE_RETURN_IF_ERROR(activation_.Init(this));
  MACE_RETURN_IF_ERROR(ResizeOutputShape(OUTPUT, input_dim_size_, input_dims_));
  return MACE_SUCCESS;
}
// Computes y = x * scale + offset per channel over a [..., C] tensor, then
// applies the fused activation in place. With 5 inputs (MEAN, VAR present),
// the running statistics are first folded into the scale/offset:
//   new_scale = scale / sqrt(var + epsilon)
//   new_offset = offset - mean * new_scale
MaceStatus BatchNormOp::Run() {
  const mifloat *scale = scale_;
  const mifloat *offset = offset_;
  // Treat the input as [batch, channels], channels being the last dim.
  const uint32_t input_dim_end_idx = input_dim_size_ - 1;
  const int32_t channels = input_dims_[input_dim_end_idx];
  const int32_t batch =
      base::accumulate_multi(input_dims_, 0, input_dim_end_idx);
  if (GetInputSize() == 5) {
    const float *mean = GetInputData<float>(MEAN);
    const float *var = GetInputData<float>(VAR);
    MACE_ASSERT1(GetInputShapeDimSize(MEAN) == 1,
                 "mean must be 1-dimensional. ");
    MACE_ASSERT1(GetInputShapeDimSize(VAR) == 1, "var must be 1-dimensional. ");
    // NOTE(review): scratch_buffer is destroyed at the end of this block
    // while new_scale/new_offset are still read through scale/offset below;
    // this assumes ~ScratchBuffer does not reclaim the underlying memory --
    // confirm against micro/framework/scratch_buffer.h.
    ScratchBuffer scratch_buffer(engine_config_);
    mifloat *new_scale = scratch_buffer.GetBuffer<mifloat>(channels);
    mifloat *new_offset = scratch_buffer.GetBuffer<mifloat>(channels);
    for (int32_t c = 0; c < channels; ++c) {
      new_scale[c] = scale_[c] / base::sqrt(var[c] + epsilon_);
      new_offset[c] = offset_[c] - mean[c] * new_scale[c];
    }
    scale = new_scale;
    offset = new_offset;
  }
  for (int32_t b = 0; b < batch; ++b) {
    const int32_t batch_base = b * channels;
    for (int32_t c = 0; c < channels; ++c) {
      output_[batch_base + c] =
          input_[batch_base + c] * scale[c] + offset[c];
    }  // c
  }  // b
  // Fused activation runs over the entire output buffer in place.
  MACE_RETURN_IF_ERROR(activation_.Compute(output_, batch * channels, output_));
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_BATCH_NORM_H_
#define MICRO_OPS_NHWC_BATCH_NORM_H_
#include "micro/framework/operator.h"
#include "micro/ops/utils/activation.h"
namespace micro {
namespace ops {
// Batch normalization operator: per-channel y = x * scale + offset.
// Takes INPUT, SCALE, OFFSET and optionally MEAN, VAR; when MEAN/VAR are
// present they are folded into scale/offset at Run time (see the .cc).
class BatchNormOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();

 private:
  const mifloat *input_;        // borrowed: input tensor data
  const int32_t *input_dims_;   // borrowed: input shape
  uint32_t input_dim_size_;     // rank of the input
  const mifloat *scale_;        // borrowed: per-channel scale (1-D)
  const int32_t *scale_dims_;   // borrowed: scale shape
  uint32_t scale_dim_size_;     // rank of scale (must be 1)
  const mifloat *offset_;       // borrowed: per-channel offset (1-D)
  const int32_t *offset_dims_;  // borrowed: offset shape
  uint32_t offset_dim_size_;    // rank of offset (must be 1)
  mifloat *output_;             // borrowed: output tensor data
  float epsilon_;               // numerical-stability term for sqrt(var + eps)
  Activation activation_;       // fused activation applied after the transform
  MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_BATCH_NORM_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/conv_2d_c2_s4.h"
#include "micro/base/logging.h"
namespace micro {
namespace ops {
// Conv2d specialization for exactly 2 output channels (C2), computing
// 4 output pixels per outer iteration (S4). Input/output are NHWC; the
// filter is laid out [out_channel, k_height, k_width, in_channel].
MaceStatus Conv2dC2S4Op::Compute(int32_t (&output_dims)[4]) {
  const int32_t batch = output_dims[0];
  const int32_t height = output_dims[1];
  const int32_t width = output_dims[2];
  const int32_t channel = output_dims[3];
  MACE_ASSERT(channel == 2);
  const int32_t k_height = filter_dims_[1];
  const int32_t k_width = filter_dims_[2];
  const int32_t k_channel = filter_dims_[3];
  MACE_ASSERT(filter_dims_[0] == channel && input_dims_[3] == k_channel);
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t in_channel = input_dims_[3];
  // padding_sizes_ presumably holds total vertical/horizontal padding;
  // half is applied before the origin -- TODO confirm in Conv2dBase.
  const int32_t pad_top = padding_sizes_[0] >> 1;
  const int32_t pad_left = padding_sizes_[1] >> 1;
  // Flattened output-pixel count; processed 4 at a time, with the final
  // group clamped back so it ends exactly at the last pixel (recomputing
  // a few pixels rather than reading past the end).
  const int32_t size = batch * height * width;
  const int32_t size_end = size - 4;
  // NOTE(review): if size < 4, size_end is negative and the clamp below
  // drives s negative (out-of-bounds indexing) -- presumably the dispatcher
  // only selects this kernel when size >= 4; confirm.
  for (int32_t s = 0; s < size; s += 4) {
    if (s > size_end) {
      s = size_end;
    }
    // Decode the 4 flattened pixel indices into (h, w) coordinates.
    int32_t h0 = s / width % height;
    int32_t h1 = (s + 1) / width % height;
    int32_t h2 = (s + 2) / width % height;
    int32_t h3 = (s + 3) / width % height;
    const int32_t in_h0 = h0 * strides_[0] - pad_top;
    const int32_t in_h1 = h1 * strides_[0] - pad_top;
    const int32_t in_h2 = h2 * strides_[0] - pad_top;
    const int32_t in_h3 = h3 * strides_[0] - pad_top;
    int32_t w0 = s % width;
    int32_t w1 = (s + 1) % width;
    int32_t w2 = (s + 2) % width;
    int32_t w3 = (s + 3) % width;
    // Output offsets of the 4 consecutive pixels (channel-contiguous).
    int32_t width_base[4] = {s * channel};
    width_base[1] = width_base[0] + channel;
    width_base[2] = width_base[1] + channel;
    width_base[3] = width_base[2] + channel;
    const int32_t in_w0 = w0 * strides_[1] - pad_left;
    const int32_t in_w1 = w1 * strides_[1] - pad_left;
    const int32_t in_w2 = w2 * strides_[1] - pad_left;
    const int32_t in_w3 = w3 * strides_[1] - pad_left;
    // channel == 2, so this loop runs once and handles both output channels.
    for (int32_t kb = 0; kb < channel; kb += 2) {
      const int32_t k_batch_base0 = kb * k_height;
      const int32_t k_batch_base1 = k_batch_base0 + k_height;
      // Accumulators: 2 output channels x 4 pixels, pixel-major.
      float output[2 * 4] = {0};
      for (int32_t kh = 0; kh < k_height; ++kh) {
        const int32_t in_h_idx0 = in_h0 + kh * dilations_[0];
        const int32_t in_h_idx1 = in_h1 + kh * dilations_[0];
        const int32_t in_h_idx2 = in_h2 + kh * dilations_[0];
        const int32_t in_h_idx3 = in_h3 + kh * dilations_[0];
        // Row-validity flags: taps landing in the padding contribute zero.
        bool h_valid[4] = {true, true, true, true};
        if (in_h_idx0 < 0 || in_h_idx0 >= in_height) {
          h_valid[0] = false;
        }
        if (in_h_idx1 < 0 || in_h_idx1 >= in_height) {
          h_valid[1] = false;
        }
        if (in_h_idx2 < 0 || in_h_idx2 >= in_height) {
          h_valid[2] = false;
        }
        if (in_h_idx3 < 0 || in_h_idx3 >= in_height) {
          h_valid[3] = false;
        }
        const int32_t k_height_base0 = (k_batch_base0 + kh) * k_width;
        const int32_t k_height_base1 = (k_batch_base1 + kh) * k_width;
        const int32_t in_h_base0 = in_h_idx0 * in_width;
        const int32_t in_h_base1 = in_h_idx1 * in_width;
        const int32_t in_h_base2 = in_h_idx2 * in_width;
        const int32_t in_h_base3 = in_h_idx3 * in_width;
        for (int32_t kw = 0; kw < k_width; ++kw) {
          const int32_t kw_dilations = kw * dilations_[1];
          const int32_t in_w_idx0 = in_w0 + kw_dilations;
          const int32_t in_w_idx1 = in_w1 + kw_dilations;
          const int32_t in_w_idx2 = in_w2 + kw_dilations;
          const int32_t in_w_idx3 = in_w3 + kw_dilations;
          bool valid[4] = {
              h_valid[0], h_valid[1], h_valid[2], h_valid[3]
          };
          if (in_w_idx0 < 0 || in_w_idx0 >= in_width) {
            valid[0] = false;
          }
          if (in_w_idx1 < 0 || in_w_idx1 >= in_width) {
            valid[1] = false;
          }
          if (in_w_idx2 < 0 || in_w_idx2 >= in_width) {
            valid[2] = false;
          }
          if (in_w_idx3 < 0 || in_w_idx3 >= in_width) {
            valid[3] = false;
          }
          const int32_t k_width_base0 = (k_height_base0 + kw) * k_channel;
          const int32_t k_width_base1 = (k_height_base1 + kw) * k_channel;
          const int32_t in_w_base[4] = {
              (in_h_base0 + in_w_idx0) * in_channel,
              (in_h_base1 + in_w_idx1) * in_channel,
              (in_h_base2 + in_w_idx2) * in_channel,
              (in_h_base3 + in_w_idx3) * in_channel
          };
          // Each input channel feeds both output-channel accumulators.
          for (int32_t kc = 0; kc < k_channel; ++kc) {
            float filter0 = filter_[k_width_base0 + kc];
            float filter1 = filter_[k_width_base1 + kc];
            if (valid[0]) {
              float input0 = input_[in_w_base[0] + kc];
              output[0] += input0 * filter0;
              output[1] += input0 * filter1;
            }
            if (valid[1]) {
              float input1 = input_[in_w_base[1] + kc];
              output[2] += input1 * filter0;
              output[3] += input1 * filter1;
            }
            if (valid[2]) {
              float input2 = input_[in_w_base[2] + kc];
              output[4] += input2 * filter0;
              output[5] += input2 * filter1;
            }
            if (valid[3]) {
              float input3 = input_[in_w_base[3] + kc];
              output[6] += input3 * filter0;
              output[7] += input3 * filter1;
            }
          }  // filter channel
        }  // filter width
      }  // filter height
      // Write both channels of each of the 4 pixels back to output_.
      for (int32_t i = 0; i < 4; ++i) {
        int32_t out_idx = width_base[i] + kb;
        int32_t buf_idx = i * 2;
        output_[out_idx] = output[buf_idx];
        output_[out_idx + 1] = output[buf_idx + 1];
      }
    }  // filter batch, output channel
  }  // output size
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CONV_2D_C2_S4_H_
#define MICRO_OPS_NHWC_CONV_2D_C2_S4_H_
#include "micro/ops/nhwc/base/conv_2d_base.h"
namespace micro {
namespace ops {
// Conv2d kernel specialized for 2 output channels, processing 4 output
// pixels per iteration. Compute presumably overrides a Conv2dBase hook
// invoked from the base class's Run -- confirm in conv_2d_base.h.
class Conv2dC2S4Op : public Conv2dBase {
 private:
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CONV_2D_C2_S4_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/conv_2d_c3_s4.h"
#include "micro/base/logging.h"
namespace micro {
namespace ops {
// Conv2d specialization for exactly 3 output channels (C3), computing
// 4 output pixels per outer iteration (S4). Input/output are NHWC; the
// filter is laid out [out_channel, k_height, k_width, in_channel].
MaceStatus Conv2dC3S4Op::Compute(int32_t (&output_dims)[4]) {
  const int32_t batch = output_dims[0];
  const int32_t height = output_dims[1];
  const int32_t width = output_dims[2];
  const int32_t channel = output_dims[3];
  MACE_ASSERT(channel == 3);
  const int32_t k_height = filter_dims_[1];
  const int32_t k_width = filter_dims_[2];
  const int32_t k_channel = filter_dims_[3];
  MACE_ASSERT(filter_dims_[0] == channel && input_dims_[3] == k_channel);
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t in_channel = input_dims_[3];
  // Half of the total padding is applied before the origin -- presumably;
  // confirm against Conv2dBase's padding computation.
  const int32_t pad_top = padding_sizes_[0] >> 1;
  const int32_t pad_left = padding_sizes_[1] >> 1;
  // Flattened output-pixel count; the final group of 4 is clamped back so
  // it ends at the last pixel (a few pixels may be recomputed).
  const int32_t size = batch * height * width;
  const int32_t size_end = size - 4;
  // NOTE(review): if size < 4 the clamp drives s negative (out-of-bounds
  // reads) -- presumably the dispatcher guarantees size >= 4; confirm.
  for (int32_t s = 0; s < size; s += 4) {
    if (s > size_end) {
      s = size_end;
    }
    // Decode the 4 flattened pixel indices into (h, w) coordinates.
    int32_t h0 = s / width % height;
    int32_t h1 = (s + 1) / width % height;
    int32_t h2 = (s + 2) / width % height;
    int32_t h3 = (s + 3) / width % height;
    const int32_t in_h0 = h0 * strides_[0] - pad_top;
    const int32_t in_h1 = h1 * strides_[0] - pad_top;
    const int32_t in_h2 = h2 * strides_[0] - pad_top;
    const int32_t in_h3 = h3 * strides_[0] - pad_top;
    int32_t w0 = s % width;
    int32_t w1 = (s + 1) % width;
    int32_t w2 = (s + 2) % width;
    int32_t w3 = (s + 3) % width;
    // Output offsets of the 4 consecutive pixels (channel-contiguous).
    int32_t width_base[4] = {s * channel};
    width_base[1] = width_base[0] + channel;
    width_base[2] = width_base[1] + channel;
    width_base[3] = width_base[2] + channel;
    const int32_t in_w0 = w0 * strides_[1] - pad_left;
    const int32_t in_w1 = w1 * strides_[1] - pad_left;
    const int32_t in_w2 = w2 * strides_[1] - pad_left;
    const int32_t in_w3 = w3 * strides_[1] - pad_left;
    // channel == 3, so this loop runs once covering all 3 output channels.
    for (int32_t kb = 0; kb < channel; kb += 3) {
      const int32_t k_batch_base0 = kb * k_height;
      const int32_t k_batch_base1 = k_batch_base0 + k_height;
      const int32_t k_batch_base2 = k_batch_base1 + k_height;
      // Accumulators: 3 output channels x 4 pixels, pixel-major.
      float output[3 * 4] = {0};
      for (int32_t kh = 0; kh < k_height; ++kh) {
        const int32_t in_h_idx0 = in_h0 + kh * dilations_[0];
        const int32_t in_h_idx1 = in_h1 + kh * dilations_[0];
        const int32_t in_h_idx2 = in_h2 + kh * dilations_[0];
        const int32_t in_h_idx3 = in_h3 + kh * dilations_[0];
        // Row-validity flags: taps landing in the padding contribute zero.
        bool h_valid[4] = {true, true, true, true};
        if (in_h_idx0 < 0 || in_h_idx0 >= in_height) {
          h_valid[0] = false;
        }
        if (in_h_idx1 < 0 || in_h_idx1 >= in_height) {
          h_valid[1] = false;
        }
        if (in_h_idx2 < 0 || in_h_idx2 >= in_height) {
          h_valid[2] = false;
        }
        if (in_h_idx3 < 0 || in_h_idx3 >= in_height) {
          h_valid[3] = false;
        }
        const int32_t k_height_base0 = (k_batch_base0 + kh) * k_width;
        const int32_t k_height_base1 = (k_batch_base1 + kh) * k_width;
        const int32_t k_height_base2 = (k_batch_base2 + kh) * k_width;
        const int32_t in_h_base0 = in_h_idx0 * in_width;
        const int32_t in_h_base1 = in_h_idx1 * in_width;
        const int32_t in_h_base2 = in_h_idx2 * in_width;
        const int32_t in_h_base3 = in_h_idx3 * in_width;
        for (int32_t kw = 0; kw < k_width; ++kw) {
          const int32_t kw_dilations = kw * dilations_[1];
          const int32_t in_w_idx0 = in_w0 + kw_dilations;
          const int32_t in_w_idx1 = in_w1 + kw_dilations;
          const int32_t in_w_idx2 = in_w2 + kw_dilations;
          const int32_t in_w_idx3 = in_w3 + kw_dilations;
          bool valid[4] = {
              h_valid[0], h_valid[1], h_valid[2], h_valid[3]
          };
          if (in_w_idx0 < 0 || in_w_idx0 >= in_width) {
            valid[0] = false;
          }
          if (in_w_idx1 < 0 || in_w_idx1 >= in_width) {
            valid[1] = false;
          }
          if (in_w_idx2 < 0 || in_w_idx2 >= in_width) {
            valid[2] = false;
          }
          if (in_w_idx3 < 0 || in_w_idx3 >= in_width) {
            valid[3] = false;
          }
          const int32_t k_width_base0 = (k_height_base0 + kw) * k_channel;
          const int32_t k_width_base1 = (k_height_base1 + kw) * k_channel;
          const int32_t k_width_base2 = (k_height_base2 + kw) * k_channel;
          const int32_t in_w_base[4] = {
              (in_h_base0 + in_w_idx0) * in_channel,
              (in_h_base1 + in_w_idx1) * in_channel,
              (in_h_base2 + in_w_idx2) * in_channel,
              (in_h_base3 + in_w_idx3) * in_channel
          };
          // Each input channel feeds all 3 output-channel accumulators.
          for (int32_t kc = 0; kc < k_channel; ++kc) {
            float filter0 = filter_[k_width_base0 + kc];
            float filter1 = filter_[k_width_base1 + kc];
            float filter2 = filter_[k_width_base2 + kc];
            if (valid[0]) {
              float input0 = input_[in_w_base[0] + kc];
              output[0] += input0 * filter0;
              output[1] += input0 * filter1;
              output[2] += input0 * filter2;
            }
            if (valid[1]) {
              float input1 = input_[in_w_base[1] + kc];
              output[3] += input1 * filter0;
              output[4] += input1 * filter1;
              output[5] += input1 * filter2;
            }
            if (valid[2]) {
              float input2 = input_[in_w_base[2] + kc];
              output[6] += input2 * filter0;
              output[7] += input2 * filter1;
              output[8] += input2 * filter2;
            }
            if (valid[3]) {
              float input3 = input_[in_w_base[3] + kc];
              output[9] += input3 * filter0;
              output[10] += input3 * filter1;
              output[11] += input3 * filter2;
            }
          }  // filter channel
        }  // filter width
      }  // filter height
      // Write the 3 channels of each of the 4 pixels back to output_.
      for (int32_t i = 0; i < 4; ++i) {
        for (int32_t j = 0; j < 3; ++j) {
          int32_t out_idx = width_base[i] + kb + j;
          output_[out_idx] = output[i * 3 + j];
        }
      }
    }  // filter batch, output channel
  }  // output size
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CONV_2D_C3_S4_H_
#define MICRO_OPS_NHWC_CONV_2D_C3_S4_H_
#include "micro/ops/nhwc/base/conv_2d_base.h"
#include "micro/ops/utils/activation.h"
namespace micro {
namespace ops {
// Conv2d kernel specialized for 3 output channels, processing 4 output
// pixels per iteration. Compute presumably overrides a Conv2dBase hook
// invoked from the base class's Run -- confirm in conv_2d_base.h.
class Conv2dC3S4Op : public Conv2dBase {
 private:
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CONV_2D_C3_S4_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/conv_2d_c4_s4.h"
#include "micro/base/logging.h"
namespace micro {
namespace ops {
// Conv2d kernel computing 4 output channels (C4) and 4 output pixels (S4)
// per inner iteration; unlike the C2/C3 variants the channel count is not
// fixed -- the channel loop is tiled by 4 with a clamped tail. Input/output
// are NHWC; filter layout is [out_channel, k_height, k_width, in_channel].
MaceStatus Conv2dC4S4Op::Compute(int32_t (&output_dims)[4]) {
  const int32_t batch = output_dims[0];
  const int32_t height = output_dims[1];
  const int32_t width = output_dims[2];
  const int32_t channel = output_dims[3];
  const int32_t k_height = filter_dims_[1];
  const int32_t k_width = filter_dims_[2];
  const int32_t k_channel = filter_dims_[3];
  MACE_ASSERT(filter_dims_[0] == channel && input_dims_[3] == k_channel);
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t in_channel = input_dims_[3];
  // Half of the total padding is applied before the origin -- presumably;
  // confirm against Conv2dBase's padding computation.
  const int32_t pad_top = padding_sizes_[0] >> 1;
  const int32_t pad_left = padding_sizes_[1] >> 1;
  const int32_t size = batch * height * width;
  const int32_t size_end = size - 4;
  const int32_t channel_end = channel - 4;
  // NOTE(review): if size < 4 or channel < 4, the clamps below drive
  // s/kb negative (out-of-bounds reads) -- presumably the dispatcher only
  // selects this kernel when both are >= 4; confirm.
  for (int32_t s = 0; s < size; s += 4) {
    if (s > size_end) {
      s = size_end;  // Re-anchor the final group at the last 4 pixels.
    }
    // Decode the 4 flattened pixel indices into (h, w) coordinates.
    int32_t h0 = s / width % height;
    int32_t h1 = (s + 1) / width % height;
    int32_t h2 = (s + 2) / width % height;
    int32_t h3 = (s + 3) / width % height;
    const int32_t in_h0 = h0 * strides_[0] - pad_top;
    const int32_t in_h1 = h1 * strides_[0] - pad_top;
    const int32_t in_h2 = h2 * strides_[0] - pad_top;
    const int32_t in_h3 = h3 * strides_[0] - pad_top;
    int32_t w0 = s % width;
    int32_t w1 = (s + 1) % width;
    int32_t w2 = (s + 2) % width;
    int32_t w3 = (s + 3) % width;
    // Output offsets of the 4 consecutive pixels (channel-contiguous).
    int32_t width_base[4] = {s * channel};
    width_base[1] = width_base[0] + channel;
    width_base[2] = width_base[1] + channel;
    width_base[3] = width_base[2] + channel;
    const int32_t in_w0 = w0 * strides_[1] - pad_left;
    const int32_t in_w1 = w1 * strides_[1] - pad_left;
    const int32_t in_w2 = w2 * strides_[1] - pad_left;
    const int32_t in_w3 = w3 * strides_[1] - pad_left;
    // Tile the output channels by 4; the tail tile is re-anchored so it
    // ends at the last channel (overlapping channels are recomputed).
    for (int32_t kb = 0; kb < channel; kb += 4) {
      if (kb > channel_end) {
        kb = channel_end;
      }
      const int32_t k_batch_base0 = kb * k_height;
      const int32_t k_batch_base1 = k_batch_base0 + k_height;
      const int32_t k_batch_base2 = k_batch_base1 + k_height;
      const int32_t k_batch_base3 = k_batch_base2 + k_height;
      // Accumulators: 4 output channels x 4 pixels, pixel-major.
      float output[4 * 4] = {0};
      for (int32_t kh = 0; kh < k_height; ++kh) {
        const int32_t in_h_idx0 = in_h0 + kh * dilations_[0];
        const int32_t in_h_idx1 = in_h1 + kh * dilations_[0];
        const int32_t in_h_idx2 = in_h2 + kh * dilations_[0];
        const int32_t in_h_idx3 = in_h3 + kh * dilations_[0];
        // Row-validity flags: taps landing in the padding contribute zero.
        bool h_valid[4] = {true, true, true, true};
        if (in_h_idx0 < 0 || in_h_idx0 >= in_height) {
          h_valid[0] = false;
        }
        if (in_h_idx1 < 0 || in_h_idx1 >= in_height) {
          h_valid[1] = false;
        }
        if (in_h_idx2 < 0 || in_h_idx2 >= in_height) {
          h_valid[2] = false;
        }
        if (in_h_idx3 < 0 || in_h_idx3 >= in_height) {
          h_valid[3] = false;
        }
        const int32_t k_height_base0 = (k_batch_base0 + kh) * k_width;
        const int32_t k_height_base1 = (k_batch_base1 + kh) * k_width;
        const int32_t k_height_base2 = (k_batch_base2 + kh) * k_width;
        const int32_t k_height_base3 = (k_batch_base3 + kh) * k_width;
        const int32_t in_h_base0 = in_h_idx0 * in_width;
        const int32_t in_h_base1 = in_h_idx1 * in_width;
        const int32_t in_h_base2 = in_h_idx2 * in_width;
        const int32_t in_h_base3 = in_h_idx3 * in_width;
        for (int32_t kw = 0; kw < k_width; ++kw) {
          const int32_t kw_dilations = kw * dilations_[1];
          const int32_t in_w_idx0 = in_w0 + kw_dilations;
          const int32_t in_w_idx1 = in_w1 + kw_dilations;
          const int32_t in_w_idx2 = in_w2 + kw_dilations;
          const int32_t in_w_idx3 = in_w3 + kw_dilations;
          bool valid[4] = {
              h_valid[0], h_valid[1], h_valid[2], h_valid[3]
          };
          if (in_w_idx0 < 0 || in_w_idx0 >= in_width) {
            valid[0] = false;
          }
          if (in_w_idx1 < 0 || in_w_idx1 >= in_width) {
            valid[1] = false;
          }
          if (in_w_idx2 < 0 || in_w_idx2 >= in_width) {
            valid[2] = false;
          }
          if (in_w_idx3 < 0 || in_w_idx3 >= in_width) {
            valid[3] = false;
          }
          const int32_t k_width_base0 = (k_height_base0 + kw) * k_channel;
          const int32_t k_width_base1 = (k_height_base1 + kw) * k_channel;
          const int32_t k_width_base2 = (k_height_base2 + kw) * k_channel;
          const int32_t k_width_base3 = (k_height_base3 + kw) * k_channel;
          const int32_t in_w_base[4] = {
              (in_h_base0 + in_w_idx0) * in_channel,
              (in_h_base1 + in_w_idx1) * in_channel,
              (in_h_base2 + in_w_idx2) * in_channel,
              (in_h_base3 + in_w_idx3) * in_channel
          };
          // Each input channel feeds all 4 output-channel accumulators.
          for (int32_t kc = 0; kc < k_channel; ++kc) {
            float filter0 = filter_[k_width_base0 + kc];
            float filter1 = filter_[k_width_base1 + kc];
            float filter2 = filter_[k_width_base2 + kc];
            float filter3 = filter_[k_width_base3 + kc];
            if (valid[0]) {
              float input0 = input_[in_w_base[0] + kc];
              output[0] += input0 * filter0;
              output[1] += input0 * filter1;
              output[2] += input0 * filter2;
              output[3] += input0 * filter3;
            }
            if (valid[1]) {
              float input1 = input_[in_w_base[1] + kc];
              output[4] += input1 * filter0;
              output[5] += input1 * filter1;
              output[6] += input1 * filter2;
              output[7] += input1 * filter3;
            }
            if (valid[2]) {
              float input2 = input_[in_w_base[2] + kc];
              output[8] += input2 * filter0;
              output[9] += input2 * filter1;
              output[10] += input2 * filter2;
              output[11] += input2 * filter3;
            }
            if (valid[3]) {
              float input3 = input_[in_w_base[3] + kc];
              output[12] += input3 * filter0;
              output[13] += input3 * filter1;
              output[14] += input3 * filter2;
              output[15] += input3 * filter3;
            }
          }  // filter channel
        }  // filter width
      }  // filter height
      // Write the current 4-channel tile of each of the 4 pixels back.
      for (int32_t i = 0; i < 4; ++i) {
        for (int32_t j = 0; j < 4; ++j) {
          int32_t out_idx = width_base[i] + kb + j;
          output_[out_idx] = output[i * 4 + j];
        }
      }
    }  // filter batch, output channel
  }  // output size
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CONV_2D_C4_S4_H_
#define MICRO_OPS_NHWC_CONV_2D_C4_S4_H_
#include "micro/ops/nhwc/base/conv_2d_base.h"
namespace micro {
namespace ops {
// Conv2d kernel computing 4 output channels and 4 output pixels per inner
// iteration (channel count not fixed; tiled by 4). Compute presumably
// overrides a Conv2dBase hook invoked from Run -- confirm in conv_2d_base.h.
class Conv2dC4S4Op : public Conv2dBase {
 private:
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CONV_2D_C4_S4_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/conv_2d_ref.h"
#include "micro/base/logging.h"
namespace micro {
namespace ops {
// Naive NHWC Conv2d reference: one scalar accumulator per output element,
// skipping filter taps that fall into the zero-padding region. Filter
// layout is [out_channel, filter_h, filter_w, in_channel].
MaceStatus Conv2dRefOp::Compute(int32_t (&output_dims)[4]) {
  const int32_t out_batch = output_dims[0];
  const int32_t out_height = output_dims[1];
  const int32_t out_width = output_dims[2];
  const int32_t out_channel = output_dims[3];
  const int32_t filter_h = filter_dims_[1];
  const int32_t filter_w = filter_dims_[2];
  const int32_t filter_c = filter_dims_[3];
  MACE_ASSERT(filter_dims_[0] == out_channel && input_dims_[3] == filter_c);
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t in_channel = input_dims_[3];
  // Half of the total padding sits before the origin.
  const int32_t pad_top = padding_sizes_[0] >> 1;
  const int32_t pad_left = padding_sizes_[1] >> 1;
  for (int32_t n = 0; n < out_batch; ++n) {
    for (int32_t oh = 0; oh < out_height; ++oh) {
      const int32_t row_base = (n * out_height + oh) * out_width;
      const int32_t in_h_origin = oh * strides_[0] - pad_top;
      for (int32_t ow = 0; ow < out_width; ++ow) {
        const int32_t pixel_base = (row_base + ow) * out_channel;
        const int32_t in_w_origin = ow * strides_[1] - pad_left;
        for (int32_t oc = 0; oc < out_channel; ++oc) {
          float sum = 0;
          for (int32_t fh = 0; fh < filter_h; ++fh) {
            const int32_t ih = in_h_origin + fh * dilations_[0];
            // Rows outside the input contribute nothing (zero padding).
            if (ih < 0 || ih >= in_height) {
              continue;
            }
            for (int32_t fw = 0; fw < filter_w; ++fw) {
              const int32_t iw = in_w_origin + fw * dilations_[1];
              // Likewise for columns outside the input.
              if (iw < 0 || iw >= in_width) {
                continue;
              }
              const int32_t filter_base =
                  ((oc * filter_h + fh) * filter_w + fw) * filter_c;
              const int32_t input_base = (ih * in_width + iw) * in_channel;
              for (int32_t fc = 0; fc < filter_c; ++fc) {
                sum += input_[input_base + fc] * filter_[filter_base + fc];
              }  // filter channel
            }  // filter width
          }  // filter height
          output_[pixel_base + oc] = sum;
        }  // output channel
      }  // output width
    }  // output height
  }  // output batch
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CONV_2D_REF_H_
#define MICRO_OPS_NHWC_CONV_2D_REF_H_
#include "micro/ops/nhwc/base/conv_2d_base.h"
namespace micro {
namespace ops {
// Fallback scalar Conv2d kernel with no shape restrictions; the optimized
// C2/C3/C4 variants cover specific channel counts. Compute presumably
// overrides a Conv2dBase hook invoked from Run -- confirm in conv_2d_base.h.
class Conv2dRefOp : public Conv2dBase {
 private:
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CONV_2D_REF_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/depthwise_conv_2d_kb1_s4.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
// Depthwise Conv2d specialized for filter batch 1 (KB1, i.e. channel
// multiplier 1 -- asserted below), computing 4 output pixels per iteration
// (S4). Each input channel is convolved with its own k_height x k_width
// filter slice; presumably channel == k_channel here -- confirm.
MaceStatus DepthwiseConv2dKB1S4Op::Compute(int32_t (&output_dims)[4]) {
  const int32_t batch = output_dims[0];
  const int32_t height = output_dims[1];
  const int32_t width = output_dims[2];
  const int32_t channel = output_dims[3];
  const int32_t k_height = filter_dims_[1];
  const int32_t k_width = filter_dims_[2];
  const int32_t k_channel = filter_dims_[3];
  MACE_ASSERT(filter_dims_[0] == 1 && input_dims_[3] == k_channel);
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t in_channel = input_dims_[3];
  // Half of the total padding is applied before the origin -- presumably;
  // confirm against the base class's padding computation.
  const int32_t pad_top = padding_sizes_[0] >> 1;
  const int32_t pad_left = padding_sizes_[1] >> 1;
  const int32_t size = batch * height * width;
  const int32_t size_end = size - 4;
  // Scratch accumulators for 4 pixels x k_channel channels, laid out
  // channel-major: output[kc * 4 + pixel].
  // NOTE(review): the ScratchBuffer temporary is destroyed at the end of
  // this statement while `output` is used for the rest of the function;
  // this assumes ~ScratchBuffer does not reclaim the memory -- confirm.
  int32_t output_size = k_channel * 4;
  float *output = ScratchBuffer(engine_config_).GetBuffer<float>(output_size);
  // NOTE(review): if size < 4, size_end is negative and the clamp below
  // drives s negative (out-of-bounds reads) -- presumably the dispatcher
  // guarantees size >= 4; confirm.
  for (int32_t s = 0; s < size; s += 4) {
    if (s > size_end) {
      s = size_end;  // Re-anchor the final group at the last 4 pixels.
    }
    // Decode the 4 flattened pixel indices into (h, w) coordinates.
    int32_t h0 = s / width % height;
    int32_t h1 = (s + 1) / width % height;
    int32_t h2 = (s + 2) / width % height;
    int32_t h3 = (s + 3) / width % height;
    const int32_t in_h0 = h0 * strides_[0] - pad_top;
    const int32_t in_h1 = h1 * strides_[0] - pad_top;
    const int32_t in_h2 = h2 * strides_[0] - pad_top;
    const int32_t in_h3 = h3 * strides_[0] - pad_top;
    int32_t w0 = s % width;
    int32_t w1 = (s + 1) % width;
    int32_t w2 = (s + 2) % width;
    int32_t w3 = (s + 3) % width;
    // Output offsets of the 4 consecutive pixels (channel-contiguous).
    int32_t width_base[4] = {s * channel};
    width_base[1] = width_base[0] + channel;
    width_base[2] = width_base[1] + channel;
    width_base[3] = width_base[2] + channel;
    const int32_t in_w0 = w0 * strides_[1] - pad_left;
    const int32_t in_w1 = w1 * strides_[1] - pad_left;
    const int32_t in_w2 = w2 * strides_[1] - pad_left;
    const int32_t in_w3 = w3 * strides_[1] - pad_left;
    // Zero the accumulators for this group of 4 pixels.
    base::memset<float>(output, 0.0f, output_size);
    for (int32_t kh = 0; kh < k_height; ++kh) {
      const int32_t in_h_idx0 = in_h0 + kh * dilations_[0];
      const int32_t in_h_idx1 = in_h1 + kh * dilations_[0];
      const int32_t in_h_idx2 = in_h2 + kh * dilations_[0];
      const int32_t in_h_idx3 = in_h3 + kh * dilations_[0];
      // Row-validity flags: taps landing in the padding contribute zero.
      bool h_valid[4] = {true, true, true, true};
      if (in_h_idx0 < 0 || in_h_idx0 >= in_height) {
        h_valid[0] = false;
      }
      if (in_h_idx1 < 0 || in_h_idx1 >= in_height) {
        h_valid[1] = false;
      }
      if (in_h_idx2 < 0 || in_h_idx2 >= in_height) {
        h_valid[2] = false;
      }
      if (in_h_idx3 < 0 || in_h_idx3 >= in_height) {
        h_valid[3] = false;
      }
      // Single filter batch (KB1), so there is one filter row base per kh.
      const int32_t k_height_base = kh * k_width;
      const int32_t in_h_base0 = in_h_idx0 * in_width;
      const int32_t in_h_base1 = in_h_idx1 * in_width;
      const int32_t in_h_base2 = in_h_idx2 * in_width;
      const int32_t in_h_base3 = in_h_idx3 * in_width;
      for (int32_t kw = 0; kw < k_width; ++kw) {
        const int32_t kw_dilations = kw * dilations_[1];
        const int32_t in_w_idx0 = in_w0 + kw_dilations;
        const int32_t in_w_idx1 = in_w1 + kw_dilations;
        const int32_t in_w_idx2 = in_w2 + kw_dilations;
        const int32_t in_w_idx3 = in_w3 + kw_dilations;
        bool valid[4] = {
            h_valid[0], h_valid[1], h_valid[2], h_valid[3]
        };
        if (in_w_idx0 < 0 || in_w_idx0 >= in_width) {
          valid[0] = false;
        }
        if (in_w_idx1 < 0 || in_w_idx1 >= in_width) {
          valid[1] = false;
        }
        if (in_w_idx2 < 0 || in_w_idx2 >= in_width) {
          valid[2] = false;
        }
        if (in_w_idx3 < 0 || in_w_idx3 >= in_width) {
          valid[3] = false;
        }
        const int32_t k_width_base = (k_height_base + kw) * k_channel;
        const int32_t in_w_base[] = {
            (in_h_base0 + in_w_idx0) * in_channel,
            (in_h_base1 + in_w_idx1) * in_channel,
            (in_h_base2 + in_w_idx2) * in_channel,
            (in_h_base3 + in_w_idx3) * in_channel
        };
        // Depthwise: channel kc of the input only feeds channel kc of the
        // output, so one filter value covers all 4 pixel accumulators.
        for (int32_t kc = 0; kc < k_channel; ++kc) {
          float *output_kc = output + kc * 4;
          float filter = filter_[k_width_base + kc];
          if (valid[0]) {
            float input0 = input_[in_w_base[0] + kc];
            output_kc[0] += input0 * filter;
          }
          if (valid[1]) {
            float input1 = input_[in_w_base[1] + kc];
            output_kc[1] += input1 * filter;
          }
          if (valid[2]) {
            float input2 = input_[in_w_base[2] + kc];
            output_kc[2] += input2 * filter;
          }
          if (valid[3]) {
            float input3 = input_[in_w_base[3] + kc];
            output_kc[3] += input3 * filter;
          }
        }  // filter channel
      }  // filter width
    }  // filter height
    // Scatter the channel-major scratch back into pixel-major output_.
    for (int32_t i = 0; i < 4; ++i) {
      int32_t out_base = width_base[i];
      for (int32_t c_offset = 0, kc_offset = 0;
          c_offset < channel; ++c_offset, kc_offset += 4) {
        output_[out_base + c_offset] = output[kc_offset + i];
      }
    }
  }  // output size
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB1_S4_H_
#define MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB1_S4_H_
#include "micro/ops/nhwc/base/depthwise_conv_2d_base.h"
namespace micro {
namespace ops {
// Depthwise Conv2D (NHWC, float) kernel specialized for a filter-batch
// (channel-multiplier) tile of 1 and a spatial tile of 4 output pixels
// (KB1/S4). Inputs, filter, strides, etc. come from DepthwiseConv2dBase.
class DepthwiseConv2dKB1S4Op : public DepthwiseConv2dBase {
 private:
  // Runs the convolution into output_ for the given output dimensions
  // {batch, height, width, channel}.
  // NOTE(review): presumably overrides a virtual declared in
  // DepthwiseConv2dBase -- confirm against the base header.
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB1_S4_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/depthwise_conv_2d_kb2_s4.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
// Depthwise Conv2D (NHWC, float) specialized for a filter-batch
// (channel-multiplier) tile of 2 and a spatial tile of 4 output pixels
// (KB2/S4). Each pass accumulates 4 pixels x 2 filter batches per filter
// channel into a scratch buffer, then scatters the results into output_.
// NOTE(review): the tail handling below re-anchors to (size - 4) and
// (k_batch - 2); it assumes size >= 4 and k_batch >= 2 -- presumably
// guaranteed by the kernel-selection logic. TODO confirm.
// NOTE(review): input indexing below never adds a batch offset, so this
// appears correct only for batch == 1 -- confirm with callers.
MaceStatus DepthwiseConv2dKB2S4Op::Compute(int32_t (&output_dims)[4]) {
  const int32_t batch = output_dims[0];
  const int32_t height = output_dims[1];
  const int32_t width = output_dims[2];
  const int32_t channel = output_dims[3];
  const int32_t k_batch = filter_dims_[0];    // channel multiplier
  const int32_t k_height = filter_dims_[1];
  const int32_t k_width = filter_dims_[2];
  const int32_t k_channel = filter_dims_[3];  // must equal input channels
  MACE_ASSERT(input_dims_[3] == k_channel);
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t in_channel = input_dims_[3];
  // padding_sizes_ holds the total padding per axis; half goes on top/left.
  const int32_t pad_top = padding_sizes_[0] >> 1;
  const int32_t pad_left = padding_sizes_[1] >> 1;
  const int32_t size = batch * height * width;  // total output pixels
  const int32_t size_end = size - 4;
  const int32_t k_batch_end = k_batch - 2;
  for (int32_t s = 0; s < size; s += 4) {
    // Tail: re-anchor the last tile onto the final 4 pixels. Pixels may be
    // recomputed, which is safe because the writeback below assigns
    // (does not accumulate into) output_.
    if (s > size_end) {
      s = size - 4;
    }
    // Decompose linear pixel indices s..s+3 into output (h, w).
    int32_t h0 = s / width % height;
    int32_t h1 = (s + 1) / width % height;
    int32_t h2 = (s + 2) / width % height;
    int32_t h3 = (s + 3) / width % height;
    // Top input row of each pixel's receptive field (may be negative).
    const int32_t in_h0 = h0 * strides_[0] - pad_top;
    const int32_t in_h1 = h1 * strides_[0] - pad_top;
    const int32_t in_h2 = h2 * strides_[0] - pad_top;
    const int32_t in_h3 = h3 * strides_[0] - pad_top;
    int32_t w0 = s % width;
    int32_t w1 = (s + 1) % width;
    int32_t w2 = (s + 2) % width;
    int32_t w3 = (s + 3) % width;
    // Element offsets of the 4 output pixels in output_; the aggregate
    // initializer zero-fills entries 1..3 before they are overwritten.
    int32_t width_base[4] = {s * channel};
    width_base[1] = width_base[0] + channel;
    width_base[2] = width_base[1] + channel;
    width_base[3] = width_base[2] + channel;
    // Leftmost input column of each pixel's receptive field.
    const int32_t in_w0 = w0 * strides_[1] - pad_left;
    const int32_t in_w1 = w1 * strides_[1] - pad_left;
    const int32_t in_w2 = w2 * strides_[1] - pad_left;
    const int32_t in_w3 = w3 * strides_[1] - pad_left;
    for (int32_t kb = 0; kb < k_batch; kb += 2) {
      // Tail for an odd channel multiplier: redo the last 2 filter batches.
      if (kb > k_batch_end) {
        kb = k_batch - 2;
      }
      const int32_t k_batch_base0 = kb * k_height;
      const int32_t k_batch_base1 = k_batch_base0 + k_height;
      // Scratch accumulators: per filter channel, 4 pixels x 2 batches.
      int32_t output_size = k_channel * 8;
      float *output =
          ScratchBuffer(engine_config_).GetBuffer<float>(output_size);
      base::memset<float>(output, 0.0f, output_size);
      for (int32_t kh = 0; kh < k_height; ++kh) {
        const int32_t in_h_idx0 = in_h0 + kh * dilations_[0];
        const int32_t in_h_idx1 = in_h1 + kh * dilations_[0];
        const int32_t in_h_idx2 = in_h2 + kh * dilations_[0];
        const int32_t in_h_idx3 = in_h3 + kh * dilations_[0];
        // Mask off taps whose row falls into the padded border.
        bool h_valid[4] = {true, true, true, true};
        if (in_h_idx0 < 0 || in_h_idx0 >= in_height) {
          h_valid[0] = false;
        }
        if (in_h_idx1 < 0 || in_h_idx1 >= in_height) {
          h_valid[1] = false;
        }
        if (in_h_idx2 < 0 || in_h_idx2 >= in_height) {
          h_valid[2] = false;
        }
        if (in_h_idx3 < 0 || in_h_idx3 >= in_height) {
          h_valid[3] = false;
        }
        const int32_t k_height_base0 = (k_batch_base0 + kh) * k_width;
        const int32_t k_height_base1 = (k_batch_base1 + kh) * k_width;
        const int32_t in_h_base0 = in_h_idx0 * in_width;
        const int32_t in_h_base1 = in_h_idx1 * in_width;
        const int32_t in_h_base2 = in_h_idx2 * in_width;
        const int32_t in_h_base3 = in_h_idx3 * in_width;
        for (int32_t kw = 0; kw < k_width; ++kw) {
          const int32_t kw_dilations = kw * dilations_[1];
          const int32_t in_w_idx0 = in_w0 + kw_dilations;
          const int32_t in_w_idx1 = in_w1 + kw_dilations;
          const int32_t in_w_idx2 = in_w2 + kw_dilations;
          const int32_t in_w_idx3 = in_w3 + kw_dilations;
          // Combine the row mask with the column bounds check.
          bool valid[4] = {
              h_valid[0], h_valid[1], h_valid[2], h_valid[3]
          };
          if (in_w_idx0 < 0 || in_w_idx0 >= in_width) {
            valid[0] = false;
          }
          if (in_w_idx1 < 0 || in_w_idx1 >= in_width) {
            valid[1] = false;
          }
          if (in_w_idx2 < 0 || in_w_idx2 >= in_width) {
            valid[2] = false;
          }
          if (in_w_idx3 < 0 || in_w_idx3 >= in_width) {
            valid[3] = false;
          }
          const int32_t k_width_base0 = (k_height_base0 + kw) * k_channel;
          const int32_t k_width_base1 = (k_height_base1 + kw) * k_channel;
          const int32_t in_w_base[] = {
              (in_h_base0 + in_w_idx0) * in_channel,
              (in_h_base1 + in_w_idx1) * in_channel,
              (in_h_base2 + in_w_idx2) * in_channel,
              (in_h_base3 + in_w_idx3) * in_channel
          };
          for (int32_t kc = 0; kc < k_channel; ++kc) {
            // Layout of output_kc: [pixel0 b0, pixel0 b1, pixel1 b0, ...].
            float *output_kc = output + kc * 8;
            float filter0 = filter_[k_width_base0 + kc];
            float filter1 = filter_[k_width_base1 + kc];
            if (valid[0]) {
              float input0 = input_[in_w_base[0] + kc];
              output_kc[0] += input0 * filter0;
              output_kc[1] += input0 * filter1;
            }
            if (valid[1]) {
              float input1 = input_[in_w_base[1] + kc];
              output_kc[2] += input1 * filter0;
              output_kc[3] += input1 * filter1;
            }
            if (valid[2]) {
              float input2 = input_[in_w_base[2] + kc];
              output_kc[4] += input2 * filter0;
              output_kc[5] += input2 * filter1;
            }
            if (valid[3]) {
              float input3 = input_[in_w_base[3] + kc];
              output_kc[6] += input3 * filter0;
              output_kc[7] += input3 * filter1;
            }
          } // filter channel
        } // filter width
      } // filter height
      // Scatter the accumulators: output channel oc = c_offset + kb + j,
      // matching the oc = kc * k_batch + kb decomposition of the ref op.
      for (int32_t i = 0; i < 4; ++i) {
        for (int32_t j = 0; j < 2; ++j) {
          int32_t out_base = width_base[i] + kb + j;
          int32_t buf_offset = i * 2 + j;
          for (int32_t c_offset = 0, kc_offset = 0;
              c_offset < channel; c_offset += k_batch, kc_offset += 8) {
            output_[out_base + c_offset] = output[kc_offset + buf_offset];
          }
        }
      }
    } // filter batch, output channel
  } // output size
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB2_S4_H_
#define MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB2_S4_H_
#include "micro/ops/nhwc/base/depthwise_conv_2d_base.h"
namespace micro {
namespace ops {
// Depthwise Conv2D (NHWC, float) kernel specialized for a filter-batch
// (channel-multiplier) tile of 2 and a spatial tile of 4 output pixels
// (KB2/S4). Inputs, filter, strides, etc. come from DepthwiseConv2dBase.
class DepthwiseConv2dKB2S4Op : public DepthwiseConv2dBase {
 private:
  // Runs the convolution into output_ for the given output dimensions
  // {batch, height, width, channel}.
  // NOTE(review): presumably overrides a virtual declared in
  // DepthwiseConv2dBase -- confirm against the base header.
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB2_S4_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/depthwise_conv_2d_kb3_s4.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
// Depthwise Conv2D (NHWC, float) specialized for a filter-batch
// (channel-multiplier) tile of 3 and a spatial tile of 4 output pixels
// (KB3/S4). Each pass accumulates 4 pixels x 3 filter batches per filter
// channel into a scratch buffer, then scatters the results into output_.
// NOTE(review): tail handling assumes size >= 4 and k_batch >= 3 --
// presumably guaranteed by the kernel-selection logic. TODO confirm.
// NOTE(review): input indexing never adds a batch offset, so this appears
// correct only for batch == 1 -- confirm with callers.
MaceStatus DepthwiseConv2dKB3S4Op::Compute(int32_t (&output_dims)[4]) {
  const int32_t batch = output_dims[0];
  const int32_t height = output_dims[1];
  const int32_t width = output_dims[2];
  const int32_t channel = output_dims[3];
  const int32_t k_batch = filter_dims_[0];    // channel multiplier
  const int32_t k_height = filter_dims_[1];
  const int32_t k_width = filter_dims_[2];
  const int32_t k_channel = filter_dims_[3];  // must equal input channels
  MACE_ASSERT(input_dims_[3] == k_channel);
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t in_channel = input_dims_[3];
  // padding_sizes_ holds the total padding per axis; half goes on top/left.
  const int32_t pad_top = padding_sizes_[0] >> 1;
  const int32_t pad_left = padding_sizes_[1] >> 1;
  const int32_t size = batch * height * width;  // total output pixels
  const int32_t size_end = size - 4;
  const int32_t k_batch_end = k_batch - 3;
  for (int32_t s = 0; s < size; s += 4) {
    // Tail: re-anchor onto the final 4 pixels; recomputation is safe
    // because the writeback assigns rather than accumulates.
    if (s > size_end) {
      s = size - 4;
    }
    // Decompose linear pixel indices s..s+3 into output (h, w).
    int32_t h0 = s / width % height;
    int32_t h1 = (s + 1) / width % height;
    int32_t h2 = (s + 2) / width % height;
    int32_t h3 = (s + 3) / width % height;
    const int32_t in_h0 = h0 * strides_[0] - pad_top;
    const int32_t in_h1 = h1 * strides_[0] - pad_top;
    const int32_t in_h2 = h2 * strides_[0] - pad_top;
    const int32_t in_h3 = h3 * strides_[0] - pad_top;
    int32_t w0 = s % width;
    int32_t w1 = (s + 1) % width;
    int32_t w2 = (s + 2) % width;
    int32_t w3 = (s + 3) % width;
    // Element offsets of the 4 output pixels; aggregate init zero-fills
    // entries 1..3 before they are overwritten.
    int32_t width_base[4] = {s * channel};
    width_base[1] = width_base[0] + channel;
    width_base[2] = width_base[1] + channel;
    width_base[3] = width_base[2] + channel;
    const int32_t in_w0 = w0 * strides_[1] - pad_left;
    const int32_t in_w1 = w1 * strides_[1] - pad_left;
    const int32_t in_w2 = w2 * strides_[1] - pad_left;
    const int32_t in_w3 = w3 * strides_[1] - pad_left;
    for (int32_t kb = 0; kb < k_batch; kb += 3) {
      // Tail when k_batch is not a multiple of 3.
      if (kb > k_batch_end) {
        kb = k_batch - 3;
      }
      const int32_t k_batch_base0 = kb * k_height;
      const int32_t k_batch_base1 = k_batch_base0 + k_height;
      const int32_t k_batch_base2 = k_batch_base1 + k_height;
      // Scratch accumulators: per filter channel, 4 pixels x 3 batches.
      int32_t output_size = k_channel * 12;
      float *output =
          ScratchBuffer(engine_config_).GetBuffer<float>(output_size);
      base::memset(output, 0.0f, output_size);
      for (int32_t kh = 0; kh < k_height; ++kh) {
        const int32_t in_h_idx0 = in_h0 + kh * dilations_[0];
        const int32_t in_h_idx1 = in_h1 + kh * dilations_[0];
        const int32_t in_h_idx2 = in_h2 + kh * dilations_[0];
        const int32_t in_h_idx3 = in_h3 + kh * dilations_[0];
        // Mask off taps whose row falls into the padded border.
        bool h_valid[4] = {true, true, true, true};
        if (in_h_idx0 < 0 || in_h_idx0 >= in_height) {
          h_valid[0] = false;
        }
        if (in_h_idx1 < 0 || in_h_idx1 >= in_height) {
          h_valid[1] = false;
        }
        if (in_h_idx2 < 0 || in_h_idx2 >= in_height) {
          h_valid[2] = false;
        }
        if (in_h_idx3 < 0 || in_h_idx3 >= in_height) {
          h_valid[3] = false;
        }
        const int32_t k_height_base0 = (k_batch_base0 + kh) * k_width;
        const int32_t k_height_base1 = (k_batch_base1 + kh) * k_width;
        const int32_t k_height_base2 = (k_batch_base2 + kh) * k_width;
        const int32_t in_h_base0 = in_h_idx0 * in_width;
        const int32_t in_h_base1 = in_h_idx1 * in_width;
        const int32_t in_h_base2 = in_h_idx2 * in_width;
        const int32_t in_h_base3 = in_h_idx3 * in_width;
        for (int32_t kw = 0; kw < k_width; ++kw) {
          const int32_t kw_dilations = kw * dilations_[1];
          const int32_t in_w_idx0 = in_w0 + kw_dilations;
          const int32_t in_w_idx1 = in_w1 + kw_dilations;
          const int32_t in_w_idx2 = in_w2 + kw_dilations;
          const int32_t in_w_idx3 = in_w3 + kw_dilations;
          // Combine the row mask with the column bounds check.
          bool valid[4] = {
              h_valid[0], h_valid[1], h_valid[2], h_valid[3]
          };
          if (in_w_idx0 < 0 || in_w_idx0 >= in_width) {
            valid[0] = false;
          }
          if (in_w_idx1 < 0 || in_w_idx1 >= in_width) {
            valid[1] = false;
          }
          if (in_w_idx2 < 0 || in_w_idx2 >= in_width) {
            valid[2] = false;
          }
          if (in_w_idx3 < 0 || in_w_idx3 >= in_width) {
            valid[3] = false;
          }
          const int32_t k_width_base0 = (k_height_base0 + kw) * k_channel;
          const int32_t k_width_base1 = (k_height_base1 + kw) * k_channel;
          const int32_t k_width_base2 = (k_height_base2 + kw) * k_channel;
          const int32_t in_w_base[] = {
              (in_h_base0 + in_w_idx0) * in_channel,
              (in_h_base1 + in_w_idx1) * in_channel,
              (in_h_base2 + in_w_idx2) * in_channel,
              (in_h_base3 + in_w_idx3) * in_channel
          };
          for (int32_t kc = 0; kc < k_channel; ++kc) {
            // Layout of output_kc: 3 consecutive batch slots per pixel.
            float *output_kc = output + kc * 12;
            float filter0 = filter_[k_width_base0 + kc];
            float filter1 = filter_[k_width_base1 + kc];
            float filter2 = filter_[k_width_base2 + kc];
            if (valid[0]) {
              float input0 = input_[in_w_base[0] + kc];
              output_kc[0] += input0 * filter0;
              output_kc[1] += input0 * filter1;
              output_kc[2] += input0 * filter2;
            }
            if (valid[1]) {
              float input1 = input_[in_w_base[1] + kc];
              output_kc[3] += input1 * filter0;
              output_kc[4] += input1 * filter1;
              output_kc[5] += input1 * filter2;
            }
            if (valid[2]) {
              float input2 = input_[in_w_base[2] + kc];
              output_kc[6] += input2 * filter0;
              output_kc[7] += input2 * filter1;
              output_kc[8] += input2 * filter2;
            }
            if (valid[3]) {
              float input3 = input_[in_w_base[3] + kc];
              output_kc[9] += input3 * filter0;
              output_kc[10] += input3 * filter1;
              output_kc[11] += input3 * filter2;
            }
          } // filter channel
        } // filter width
      } // filter height
      // Scatter: output channel oc = c_offset + kb + j, matching the
      // oc = kc * k_batch + kb decomposition of the ref op.
      for (int32_t i = 0; i < 4; ++i) {
        for (int32_t j = 0; j < 3; ++j) {
          int32_t out_base = width_base[i] + kb + j;
          int32_t buf_offset = i * 3 + j;
          for (int32_t c_offset = 0, kc_offset = 0;
              c_offset < channel; c_offset += k_batch, kc_offset += 12) {
            output_[out_base + c_offset] = output[kc_offset + buf_offset];
          }
        }
      }
    } // filter batch, output channel
  } // output size
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB3_S4_H_
#define MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB3_S4_H_
#include "micro/ops/nhwc/base/depthwise_conv_2d_base.h"
namespace micro {
namespace ops {
// Depthwise Conv2D (NHWC, float) kernel specialized for a filter-batch
// (channel-multiplier) tile of 3 and a spatial tile of 4 output pixels
// (KB3/S4). Inputs, filter, strides, etc. come from DepthwiseConv2dBase.
class DepthwiseConv2dKB3S4Op : public DepthwiseConv2dBase {
 private:
  // Runs the convolution into output_ for the given output dimensions
  // {batch, height, width, channel}.
  // NOTE(review): presumably overrides a virtual declared in
  // DepthwiseConv2dBase -- confirm against the base header.
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB3_S4_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/depthwise_conv_2d_kb4_s4.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
// Depthwise Conv2D (NHWC, float) specialized for a filter-batch
// (channel-multiplier) tile of 4 and a spatial tile of 4 output pixels
// (KB4/S4). Each pass accumulates 4 pixels x 4 filter batches per filter
// channel into a scratch buffer, then scatters the results into output_.
// NOTE(review): tail handling assumes size >= 4 and k_batch >= 4 --
// presumably guaranteed by the kernel-selection logic. TODO confirm.
// NOTE(review): input indexing never adds a batch offset, so this appears
// correct only for batch == 1 -- confirm with callers.
MaceStatus DepthwiseConv2dKB4S4Op::Compute(int32_t (&output_dims)[4]) {
  const int32_t batch = output_dims[0];
  const int32_t height = output_dims[1];
  const int32_t width = output_dims[2];
  const int32_t channel = output_dims[3];
  const int32_t k_batch = filter_dims_[0];    // channel multiplier
  const int32_t k_height = filter_dims_[1];
  const int32_t k_width = filter_dims_[2];
  const int32_t k_channel = filter_dims_[3];  // must equal input channels
  MACE_ASSERT(input_dims_[3] == k_channel);
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t in_channel = input_dims_[3];
  // padding_sizes_ holds the total padding per axis; half goes on top/left.
  const int32_t pad_top = padding_sizes_[0] >> 1;
  const int32_t pad_left = padding_sizes_[1] >> 1;
  const int32_t size = batch * height * width;  // total output pixels
  const int32_t size_end = size - 4;
  const int32_t k_batch_end = k_batch - 4;
  for (int32_t s = 0; s < size; s += 4) {
    // Tail: re-anchor onto the final 4 pixels; recomputation is safe
    // because the writeback assigns rather than accumulates.
    if (s > size_end) {
      s = size - 4;
    }
    // Decompose linear pixel indices s..s+3 into output (h, w).
    int32_t h0 = s / width % height;
    int32_t h1 = (s + 1) / width % height;
    int32_t h2 = (s + 2) / width % height;
    int32_t h3 = (s + 3) / width % height;
    const int32_t in_h0 = h0 * strides_[0] - pad_top;
    const int32_t in_h1 = h1 * strides_[0] - pad_top;
    const int32_t in_h2 = h2 * strides_[0] - pad_top;
    const int32_t in_h3 = h3 * strides_[0] - pad_top;
    int32_t w0 = s % width;
    int32_t w1 = (s + 1) % width;
    int32_t w2 = (s + 2) % width;
    int32_t w3 = (s + 3) % width;
    // Element offsets of the 4 output pixels; aggregate init zero-fills
    // entries 1..3 before they are overwritten.
    int32_t width_base[4] = {s * channel};
    width_base[1] = width_base[0] + channel;
    width_base[2] = width_base[1] + channel;
    width_base[3] = width_base[2] + channel;
    const int32_t in_w0 = w0 * strides_[1] - pad_left;
    const int32_t in_w1 = w1 * strides_[1] - pad_left;
    const int32_t in_w2 = w2 * strides_[1] - pad_left;
    const int32_t in_w3 = w3 * strides_[1] - pad_left;
    for (int32_t kb = 0; kb < k_batch; kb += 4) {
      // Tail when k_batch is not a multiple of 4.
      if (kb > k_batch_end) {
        kb = k_batch - 4;
      }
      const int32_t k_batch_base0 = kb * k_height;
      const int32_t k_batch_base1 = k_batch_base0 + k_height;
      const int32_t k_batch_base2 = k_batch_base1 + k_height;
      const int32_t k_batch_base3 = k_batch_base2 + k_height;
      // Scratch accumulators: per filter channel, 4 pixels x 4 batches.
      int32_t output_size = k_channel * 16;
      float *output =
          ScratchBuffer(engine_config_).GetBuffer<float>(output_size);
      base::memset(output, static_cast<float>(0.0f), output_size);
      for (int32_t kh = 0; kh < k_height; ++kh) {
        const int32_t in_h_idx0 = in_h0 + kh * dilations_[0];
        const int32_t in_h_idx1 = in_h1 + kh * dilations_[0];
        const int32_t in_h_idx2 = in_h2 + kh * dilations_[0];
        const int32_t in_h_idx3 = in_h3 + kh * dilations_[0];
        // Mask off taps whose row falls into the padded border.
        bool h_valid[4] = {true, true, true, true};
        if (in_h_idx0 < 0 || in_h_idx0 >= in_height) {
          h_valid[0] = false;
        }
        if (in_h_idx1 < 0 || in_h_idx1 >= in_height) {
          h_valid[1] = false;
        }
        if (in_h_idx2 < 0 || in_h_idx2 >= in_height) {
          h_valid[2] = false;
        }
        if (in_h_idx3 < 0 || in_h_idx3 >= in_height) {
          h_valid[3] = false;
        }
        const int32_t k_height_base0 = (k_batch_base0 + kh) * k_width;
        const int32_t k_height_base1 = (k_batch_base1 + kh) * k_width;
        const int32_t k_height_base2 = (k_batch_base2 + kh) * k_width;
        const int32_t k_height_base3 = (k_batch_base3 + kh) * k_width;
        const int32_t in_h_base0 = in_h_idx0 * in_width;
        const int32_t in_h_base1 = in_h_idx1 * in_width;
        const int32_t in_h_base2 = in_h_idx2 * in_width;
        const int32_t in_h_base3 = in_h_idx3 * in_width;
        for (int32_t kw = 0; kw < k_width; ++kw) {
          const int32_t kw_dilations = kw * dilations_[1];
          const int32_t in_w_idx0 = in_w0 + kw_dilations;
          const int32_t in_w_idx1 = in_w1 + kw_dilations;
          const int32_t in_w_idx2 = in_w2 + kw_dilations;
          const int32_t in_w_idx3 = in_w3 + kw_dilations;
          // Combine the row mask with the column bounds check.
          bool valid[4] = {
              h_valid[0], h_valid[1], h_valid[2], h_valid[3]
          };
          if (in_w_idx0 < 0 || in_w_idx0 >= in_width) {
            valid[0] = false;
          }
          if (in_w_idx1 < 0 || in_w_idx1 >= in_width) {
            valid[1] = false;
          }
          if (in_w_idx2 < 0 || in_w_idx2 >= in_width) {
            valid[2] = false;
          }
          if (in_w_idx3 < 0 || in_w_idx3 >= in_width) {
            valid[3] = false;
          }
          const int32_t k_width_base0 = (k_height_base0 + kw) * k_channel;
          const int32_t k_width_base1 = (k_height_base1 + kw) * k_channel;
          const int32_t k_width_base2 = (k_height_base2 + kw) * k_channel;
          const int32_t k_width_base3 = (k_height_base3 + kw) * k_channel;
          const int32_t in_w_base[4] = {
              (in_h_base0 + in_w_idx0) * in_channel,
              (in_h_base1 + in_w_idx1) * in_channel,
              (in_h_base2 + in_w_idx2) * in_channel,
              (in_h_base3 + in_w_idx3) * in_channel
          };
          for (int32_t kc = 0; kc < k_channel; ++kc) {
            // Layout of output_kc: 4 consecutive batch slots per pixel.
            float *output_kc = output + kc * 16;
            float filter0 = filter_[k_width_base0 + kc];
            float filter1 = filter_[k_width_base1 + kc];
            float filter2 = filter_[k_width_base2 + kc];
            float filter3 = filter_[k_width_base3 + kc];
            if (valid[0]) {
              float input0 = input_[in_w_base[0] + kc];
              output_kc[0] += input0 * filter0;
              output_kc[1] += input0 * filter1;
              output_kc[2] += input0 * filter2;
              output_kc[3] += input0 * filter3;
            }
            if (valid[1]) {
              float input1 = input_[in_w_base[1] + kc];
              output_kc[4] += input1 * filter0;
              output_kc[5] += input1 * filter1;
              output_kc[6] += input1 * filter2;
              output_kc[7] += input1 * filter3;
            }
            if (valid[2]) {
              float input2 = input_[in_w_base[2] + kc];
              output_kc[8] += input2 * filter0;
              output_kc[9] += input2 * filter1;
              output_kc[10] += input2 * filter2;
              output_kc[11] += input2 * filter3;
            }
            if (valid[3]) {
              float input3 = input_[in_w_base[3] + kc];
              output_kc[12] += input3 * filter0;
              output_kc[13] += input3 * filter1;
              output_kc[14] += input3 * filter2;
              output_kc[15] += input3 * filter3;
            }
          } // filter channel
        } // filter width
      } // filter height
      // Scatter: output channel oc = c_offset + kb + j, matching the
      // oc = kc * k_batch + kb decomposition of the ref op.
      for (int32_t i = 0; i < 4; ++i) {
        for (int32_t j = 0; j < 4; ++j) {
          int32_t out_base = width_base[i] + kb + j;
          int32_t buf_offset = i * 4 + j;
          for (int32_t c_offset = 0, kc_offset = 0;
              c_offset < channel; c_offset += k_batch, kc_offset += 16) {
            output_[out_base + c_offset] = output[kc_offset + buf_offset];
          }
        }
      }
    } // filter batch, output channel
  } // output size
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB4_S4_H_
#define MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB4_S4_H_
#include "micro/ops/nhwc/base/depthwise_conv_2d_base.h"
namespace micro {
namespace ops {
// Depthwise Conv2D (NHWC, float) kernel specialized for a filter-batch
// (channel-multiplier) tile of 4 and a spatial tile of 4 output pixels
// (KB4/S4). Inputs, filter, strides, etc. come from DepthwiseConv2dBase.
class DepthwiseConv2dKB4S4Op : public DepthwiseConv2dBase {
 private:
  // Runs the convolution into output_ for the given output dimensions
  // {batch, height, width, channel}.
  // NOTE(review): presumably overrides a virtual declared in
  // DepthwiseConv2dBase -- confirm against the base header.
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_KB4_S4_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/depthwise_conv_2d_ref.h"
#include "micro/base/logging.h"
namespace micro {
namespace ops {
// Reference (scalar, unoptimized) depthwise Conv2D over NHWC float
// tensors. Output channel oc maps to filter batch m = oc % multiplier and
// shared input/filter channel ic = oc / multiplier; taps falling into the
// padded border are skipped.
// NOTE(review): the input index below ignores the batch dimension, so this
// appears correct only for batch == 1 -- confirm with callers.
MaceStatus DepthwiseConv2dRefOp::Compute(int32_t (&output_dims)[4]) {
  const int32_t out_batch = output_dims[0];
  const int32_t out_height = output_dims[1];
  const int32_t out_width = output_dims[2];
  const int32_t out_channel = output_dims[3];
  const int32_t multiplier = filter_dims_[0];      // channel multiplier
  const int32_t filter_height = filter_dims_[1];
  const int32_t filter_width = filter_dims_[2];
  const int32_t filter_channel = filter_dims_[3];  // == input channels
  MACE_ASSERT(input_dims_[3] == filter_channel);
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t in_channel = input_dims_[3];
  // padding holds the total per axis; half of it goes on the top/left side.
  const int32_t pad_top = padding_sizes_[0] >> 1;
  const int32_t pad_left = padding_sizes_[1] >> 1;
  int32_t out_idx = 0;  // linear NHWC index into output_, advanced per value
  for (int32_t b = 0; b < out_batch; ++b) {
    for (int32_t h = 0; h < out_height; ++h) {
      const int32_t in_h_origin = h * strides_[0] - pad_top;
      for (int32_t w = 0; w < out_width; ++w) {
        const int32_t in_w_origin = w * strides_[1] - pad_left;
        for (int32_t oc = 0; oc < out_channel; ++oc, ++out_idx) {
          const int32_t m = oc % multiplier;   // filter "batch" slot
          const int32_t ic = oc / multiplier;  // shared in/filter channel
          float acc = 0;
          for (int32_t fh = 0; fh < filter_height; ++fh) {
            const int32_t ih = in_h_origin + fh * dilations_[0];
            if (ih < 0 || ih >= in_height) {
              continue;  // row lies in the padded border
            }
            for (int32_t fw = 0; fw < filter_width; ++fw) {
              const int32_t iw = in_w_origin + fw * dilations_[1];
              if (iw < 0 || iw >= in_width) {
                continue;  // column lies in the padded border
              }
              // Filter layout: [multiplier, fh, fw, channel].
              const int32_t filter_idx =
                  ((m * filter_height + fh) * filter_width + fw) *
                      filter_channel + ic;
              const int32_t input_idx =
                  (ih * in_width + iw) * in_channel + ic;
              acc += input_[input_idx] * filter_[filter_idx];
            }
          }
          output_[out_idx] = acc;
        }
      }
    }
  }
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_REF_H_
#define MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_REF_H_
#include "micro/ops/nhwc/base/depthwise_conv_2d_base.h"
namespace micro {
namespace ops {
// Reference (scalar, unoptimized) depthwise Conv2D kernel for NHWC float
// tensors; serves as the correctness baseline for the KB*/S4 variants.
class DepthwiseConv2dRefOp : public DepthwiseConv2dBase {
 private:
  // Runs the convolution into output_ for the given output dimensions
  // {batch, height, width, channel}.
  // NOTE(review): presumably overrides a virtual declared in
  // DepthwiseConv2dBase -- confirm against the base header.
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_DEPTHWISE_CONV_2D_REF_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/pooling_ref.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
// Max pooling over an NHWC float input. For each output pixel, writes the
// per-channel maximum over the pooling window into output_; window taps
// falling into the padded border are skipped.
// NOTE(review): the code mixes in_channels and out_channels for the same
// per-channel buffer, which is only consistent when they are equal --
// pooling preserves the channel count, so presumably they always are.
void PoolingRefOp::MaxPooling(const mifloat *input,
                              const int32_t *filter_hw,
                              const int32_t *stride_hw,
                              const int32_t *dilation_hw,
                              const int32_t *pad_hw) {
  const int32_t batch = output_dims_[0];
  const int32_t out_channels = output_dims_[3];
  const int32_t out_height = output_dims_[1];
  const int32_t out_width = output_dims_[2];
  const int32_t in_channels = input_dims_[3];
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  // Per-channel running maxima for the current window.
  float *max = ScratchBuffer(engine_config_).GetBuffer<float>(in_channels);
  for (int32_t b = 0; b < batch; ++b) {
    int32_t batch_base = b * out_height;
    int32_t in_b_base = b * in_height;
    for (int32_t h = 0; h < out_height; ++h) {
      int32_t height_base = (batch_base + h) * out_width;
      int32_t inh_addr = h * stride_hw[0] - pad_hw[0];
      for (int32_t w = 0; w < out_width; ++w) {
        int32_t width_base = (height_base + w) * out_channels;
        int32_t inw_addr = w * stride_hw[1] - pad_hw[1];
        for (int32_t c = 0; c < in_channels; ++c) {
          max[c] = base::lowest();
        }
        for (int32_t fh = 0; fh < filter_hw[0]; ++fh) {
          int32_t inh = inh_addr + dilation_hw[0] * fh;
          // BUGFIX: was "inh < 0 && inh >= in_height", which is always
          // false, so out-of-range rows were read out of bounds.
          if (inh < 0 || inh >= in_height) {
            continue;
          }
          int32_t in_h_base = (in_b_base + inh) * in_width;
          for (int32_t fw = 0; fw < filter_hw[1]; ++fw) {
            int32_t inw = inw_addr + dilation_hw[1] * fw;
            // Column validity does not depend on the channel; hoisted out
            // of the per-channel loop (behavior unchanged).
            if (inw < 0 || inw >= in_width) {
              continue;
            }
            int32_t in_w_base = (in_h_base + inw) * in_channels;
            for (int32_t c = 0; c < out_channels; ++c) {
              const int32_t input_offset = in_w_base + c;
              float input_value = input[input_offset];
              if (input_value > max[c]) {
                max[c] = input_value;
              }
            }
          }
        }
        for (int i = 0; i < in_channels; ++i) {
          output_[width_base + i] = max[i];
        }
      }
    }
  }
}
// Average pooling over an NHWC float input. For each output pixel, writes
// the per-channel mean over the VALID (non-padding) taps of the window;
// block_size counts the valid taps so padding does not dilute the mean.
// NOTE(review): if a window lies entirely in padding, block_size stays 0
// and the final division divides by zero -- presumably the padding
// computation guarantees every window overlaps the input. TODO confirm.
void PoolingRefOp::AvgPooling(const mifloat *input,
                              const int32_t *filter_hw,
                              const int32_t *stride_hw,
                              const int32_t *dilation_hw,
                              const int32_t *pad_hw) {
  const int32_t batch = output_dims_[0];
  const int32_t out_channels = output_dims_[3];
  const int32_t out_height = output_dims_[1];
  const int32_t out_width = output_dims_[2];
  const int32_t in_channels = input_dims_[3];
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  ScratchBuffer scratch_buffer(engine_config_);
  float *total = scratch_buffer.GetBuffer<float>(in_channels);
  uint32_t *block_size = scratch_buffer.GetBuffer<uint32_t>(in_channels);
  for (int32_t b = 0; b < batch; ++b) {
    int32_t batch_base = b * out_height;
    int32_t in_b_base = b * in_height;
    for (int32_t h = 0; h < out_height; ++h) {
      int32_t height_base = (batch_base + h) * out_width;
      int32_t inh_addr = h * stride_hw[0] - pad_hw[0];
      for (int32_t w = 0; w < out_width; ++w) {
        int32_t width_base = (height_base + w) * out_channels;
        int32_t inw_addr = w * stride_hw[1] - pad_hw[1];
        for (int32_t c = 0; c < out_channels; ++c) {
          total[c] = 0;
          block_size[c] = 0;
        }
        for (int32_t fh = 0; fh < filter_hw[0]; ++fh) {
          int32_t inh = inh_addr + dilation_hw[0] * fh;
          // Row/column validity is loop-invariant w.r.t. the channel;
          // hoisted out of the per-channel loop (behavior unchanged --
          // the original tested all four bounds inside it).
          if (inh < 0 || inh >= in_height) {
            continue;
          }
          int32_t in_h_base = (in_b_base + inh) * in_width;
          for (int32_t fw = 0; fw < filter_hw[1]; ++fw) {
            int32_t inw = inw_addr + dilation_hw[1] * fw;
            if (inw < 0 || inw >= in_width) {
              continue;
            }
            int32_t in_w_base = (in_h_base + inw) * in_channels;
            for (int32_t c = 0; c < out_channels; ++c) {
              total[c] += input[in_w_base + c];
              ++block_size[c];
            }
          }
        }
        for (int32_t c = 0; c < out_channels; ++c) {
          output_[width_base + c] = total[c] / block_size[c];
        }
      }
    }
  }
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_POOLING_REF_H_
#define MICRO_OPS_NHWC_POOLING_REF_H_
#include "micro/model/output_shape.h"
#include "micro/ops/nhwc/base/pooling_base.h"
namespace micro {
namespace ops {
// Reference (scalar) pooling kernel for NHWC tensors. The pooling kind
// (max vs. average) is presumably selected by the PoolingBase base class,
// which is not visible in this file -- confirm against pooling_base.h.
class PoolingRefOp : public PoolingBase {
 private:
  // Sliding-window maximum over the input; each pointer argument is a
  // 2-element {height, width} array.
  void MaxPooling(const mifloat *input, const int32_t *filter_hw,
                  const int32_t *stride_hw, const int32_t *dilation_hw,
                  const int32_t *pad_hw);
  // Sliding-window mean; out-of-range (padded) positions are excluded
  // from both the sum and the divisor.
  void AvgPooling(const mifloat *input, const int32_t *filter_hw,
                  const int32_t *stride_hw, const int32_t *dilation_hw,
                  const int32_t *pad_hw);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_POOLING_REF_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/pooling_s4.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
void PoolingS4Op::MaxPooling(const mifloat *input,
                             const int32_t *filter_hw,
                             const int32_t *stride_hw,
                             const int32_t *dilation_hw,
                             const int32_t *pad_hw) {
  // NHWC max pooling that visits the pooling window four filter taps per
  // iteration. Shapes come from the op's cached dims:
  //   output_dims_ / input_dims_ = {N, H, W, C}.
  // NOTE(review): assumes in_channels == out_channels (pooling preserves
  // the channel count) -- the scratch buffer is sized with in_channels
  // but indexed with out_channels; confirm PoolingBase guarantees this.
  const int32_t batch = output_dims_[0];
  const int32_t out_channels = output_dims_[3];
  const int32_t out_height = output_dims_[1];
  const int32_t out_width = output_dims_[2];
  const int32_t in_channels = input_dims_[3];
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t filter_size = filter_hw[0] * filter_hw[1];
  // Largest multiple of 4 that fits in the window; leftover taps go
  // through a scalar tail loop. (The previous code clamped the loop index
  // to filter_size - 4, which re-visited taps when filter_size was not a
  // multiple of 4 and computed NEGATIVE tap indices when filter_size < 4,
  // potentially comparing against elements outside the window.)
  const int32_t filter_size_align4 = filter_size - (filter_size % 4);
  // Running per-channel maximum for the current output position.
  float *max = ScratchBuffer(engine_config_).GetBuffer<float>(in_channels);
  for (int32_t b = 0; b < batch; ++b) {
    int32_t batch_base = b * out_height;
    int32_t in_b_base = b * in_height;
    for (int32_t h = 0; h < out_height; ++h) {
      int32_t height_base = (batch_base + h) * out_width;
      int32_t inh_base = h * stride_hw[0] - pad_hw[0];
      for (int32_t w = 0; w < out_width; ++w) {
        int32_t width_base = (height_base + w) * out_channels;
        int32_t inw_base = w * stride_hw[1] - pad_hw[1];
        for (int32_t c = 0; c < out_channels; ++c) {
          max[c] = base::lowest();
        }
        // Main body: four filter taps per iteration.
        for (int32_t s = 0; s < filter_size_align4; s += 4) {
          bool valid[4];
          int32_t in_w_base[4];
          for (int32_t i = 0; i < 4; ++i) {
            const int32_t fh = (s + i) / filter_hw[1];
            const int32_t fw = (s + i) % filter_hw[1];
            const int32_t inh = inh_base + dilation_hw[0] * fh;
            const int32_t inw = inw_base + dilation_hw[1] * fw;
            valid[i] =
                inh >= 0 && inh < in_height && inw >= 0 && inw < in_width;
            in_w_base[i] = ((in_b_base + inh) * in_width + inw) * in_channels;
          }
          for (int32_t c = 0; c < out_channels; ++c) {
            for (int32_t i = 0; i < 4; ++i) {
              if (valid[i]) {
                const float input_value = input[in_w_base[i] + c];
                if (input_value > max[c]) {
                  max[c] = input_value;
                }
              }
            }
          }
        }
        // Scalar tail: the taps (at most 3, or the whole window when
        // filter_size < 4) that do not fill a group of four.
        for (int32_t s = filter_size_align4; s < filter_size; ++s) {
          const int32_t fh = s / filter_hw[1];
          const int32_t fw = s % filter_hw[1];
          const int32_t inh = inh_base + dilation_hw[0] * fh;
          const int32_t inw = inw_base + dilation_hw[1] * fw;
          if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
            const int32_t in_w_base =
                ((in_b_base + inh) * in_width + inw) * in_channels;
            for (int32_t c = 0; c < out_channels; ++c) {
              const float input_value = input[in_w_base + c];
              if (input_value > max[c]) {
                max[c] = input_value;
              }
            }
          }
        }
        // Write the per-channel maxima. (Previously this loop iterated
        // in_channels with a plain int; use out_channels / int32_t for
        // consistency with the offsets computed from out_channels.)
        for (int32_t c = 0; c < out_channels; ++c) {
          output_[width_base + c] = max[c];
        }
      }
    }
  }
}
void PoolingS4Op::AvgPooling(const mifloat *input,
                             const int32_t *filter_hw,
                             const int32_t *stride_hw,
                             const int32_t *dilation_hw,
                             const int32_t *pad_hw) {
  // NHWC average pooling that visits the pooling window four filter taps
  // per iteration. Out-of-range (padded) positions contribute neither to
  // the sum nor to the divisor, matching the reference kernel.
  // NOTE(review): assumes in_channels == out_channels (pooling preserves
  // the channel count); scratch buffers are sized with in_channels but
  // indexed with out_channels.
  const int32_t batch = output_dims_[0];
  const int32_t out_channels = output_dims_[3];
  const int32_t out_height = output_dims_[1];
  const int32_t out_width = output_dims_[2];
  const int32_t in_channels = input_dims_[3];
  const int32_t in_height = input_dims_[1];
  const int32_t in_width = input_dims_[2];
  const int32_t filter_size = filter_hw[0] * filter_hw[1];
  // Largest multiple of 4 that fits in the window; the remainder is
  // handled by a scalar tail loop. (The previous code clamped the loop
  // index to filter_size - 4, which re-visited taps when filter_size was
  // not a multiple of 4 -- double-counting them in both the sum and the
  // divisor and skewing the average -- and produced negative tap indices
  // when filter_size < 4.)
  const int32_t filter_size_align4 = filter_size - (filter_size % 4);
  ScratchBuffer scratch_buffer(engine_config_);
  // Per-channel running sum and count of valid (non-padded) taps.
  float *total = scratch_buffer.GetBuffer<float>(in_channels);
  uint32_t *block_size = scratch_buffer.GetBuffer<uint32_t>(in_channels);
  for (int32_t b = 0; b < batch; ++b) {
    int32_t batch_base = b * out_height;
    int32_t in_b_base = b * in_height;
    for (int32_t h = 0; h < out_height; ++h) {
      int32_t height_base = (batch_base + h) * out_width;
      int32_t inh_base = h * stride_hw[0] - pad_hw[0];
      for (int32_t w = 0; w < out_width; ++w) {
        int32_t width_base = (height_base + w) * out_channels;
        int32_t inw_base = w * stride_hw[1] - pad_hw[1];
        for (int32_t c = 0; c < out_channels; ++c) {
          total[c] = 0;
          block_size[c] = 0;
        }
        // Main body: four filter taps per iteration.
        for (int32_t s = 0; s < filter_size_align4; s += 4) {
          bool valid[4];
          int32_t in_w_base[4];
          for (int32_t i = 0; i < 4; ++i) {
            const int32_t fh = (s + i) / filter_hw[1];
            const int32_t fw = (s + i) % filter_hw[1];
            const int32_t inh = inh_base + dilation_hw[0] * fh;
            const int32_t inw = inw_base + dilation_hw[1] * fw;
            valid[i] =
                inh >= 0 && inh < in_height && inw >= 0 && inw < in_width;
            in_w_base[i] = ((in_b_base + inh) * in_width + inw) * in_channels;
          }
          const uint32_t block_num = static_cast<uint32_t>(valid[0]) +
              valid[1] + valid[2] + valid[3];
          for (int32_t c = 0; c < out_channels; ++c) {
            float total_c = 0;
            for (int32_t i = 0; i < 4; ++i) {
              if (valid[i]) {
                total_c += input[in_w_base[i] + c];
              }
            }
            total[c] += total_c;
            block_size[c] += block_num;
          }
        }
        // Scalar tail: remaining taps that do not fill a group of four.
        for (int32_t s = filter_size_align4; s < filter_size; ++s) {
          const int32_t fh = s / filter_hw[1];
          const int32_t fw = s % filter_hw[1];
          const int32_t inh = inh_base + dilation_hw[0] * fh;
          const int32_t inw = inw_base + dilation_hw[1] * fw;
          if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
            const int32_t in_w_base =
                ((in_b_base + inh) * in_width + inw) * in_channels;
            for (int32_t c = 0; c < out_channels; ++c) {
              total[c] += input[in_w_base + c];
              ++block_size[c];
            }
          }
        }
        // NOTE(review): if a window overlaps only padding, block_size
        // stays 0 and this division is undefined; the reference kernel
        // behaves the same -- confirm upstream guarantees every window
        // overlaps the input.
        for (int32_t c = 0; c < out_channels; ++c) {
          output_[width_base + c] = total[c] / block_size[c];
        }
      }
    }
  }
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_POOLING_S4_H_
#define MICRO_OPS_NHWC_POOLING_S4_H_
#include "micro/model/output_shape.h"
#include "micro/ops/nhwc/base/pooling_base.h"
namespace micro {
namespace ops {
// Pooling kernel that walks the pooling window four filter taps per
// iteration ("s4"); intended to match PoolingRefOp's results. Dispatch
// between max and average presumably happens in PoolingBase (not shown
// in this file).
class PoolingS4Op : public PoolingBase {
 private:
  // Sliding-window maximum over an NHWC input; each pointer argument is
  // a 2-element {height, width} array.
  void MaxPooling(const mifloat *input, const int32_t *filter_hw,
                  const int32_t *stride_hw, const int32_t *dilation_hw,
                  const int32_t *pad_hw);
  // Sliding-window mean; padded positions are excluded from the divisor.
  void AvgPooling(const mifloat *input, const int32_t *filter_hw,
                  const int32_t *stride_hw, const int32_t *dilation_hw,
                  const int32_t *pad_hw);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_POOLING_S4_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/reduce.h"
#include "micro/base/logging.h"
namespace micro {
namespace ops {
// Reads the reduction configuration from the op definition: the kind of
// reduction (defaults to MEAN), the axes to reduce over, and whether the
// reduced dimensions are kept as size-1 dims.
MaceStatus ReduceOpBase::OnInit() {
  const int32_t type_arg =
      GetArgByName("reduce_type", static_cast<int32_t>(MEAN));
  reduce_type_ = static_cast<ReduceType>(type_arg);
  axis_ = GetRepeatArgByName<int32_t>("axis", &axis_size_);
  keep_dims_ = GetArgByName("keepdims", false);
  return MACE_SUCCESS;
}
void ReduceOpBase::Validate() {
#ifndef NDEBUG
const int32_t input_dim_size = GetInputShapeDimSize(INPUT);
const int32_t left = input_dim_size * -1;
const int32_t right = input_dim_size;
if (axis_size_) {
for (uint32_t i = 0; i < axis_size_; ++i) {
MACE_ASSERT1(axis_[i] > left && axis_[i] < right, "Axis is over range.");
}
}
#endif
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_REDUCE_H_
#define MICRO_OPS_REDUCE_H_
#include "micro/base/logging.h"
#include "micro/base/types.h"
#include "micro/base/utils.h"
#include "micro/framework/operator.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
// Shared configuration parsing and state for the reduction operators
// (mean/min/max/prod/sum over a set of axes).
class ReduceOpBase : public framework::Operator {
 public:
  // Parses reduce_type / axis / keepdims from the op definition.
  MaceStatus OnInit();
 public:
  // Supported reduction kinds; numeric values match the model arg.
  enum ReduceType {
    MEAN = 0,
    MIN = 1,
    MAX = 2,
    PROD = 3,
    SUM = 4,
  };
 protected:
  // Debug-only check that each axis lies in (-rank, rank).
  void Validate();
 protected:
  ReduceType reduce_type_;  // Which reduction to apply.
  const int32_t *axis_;     // Axes to reduce; empty means all axes.
  uint32_t axis_size_;      // Number of entries in axis_.
  bool keep_dims_;          // Keep reduced dims as size-1 dimensions.
  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
// Generic reduction kernel, templated over the element type T. The input
// shape is first "simplified": adjacent dimensions that are all-reduced
// or all-kept are merged, yielding a reshaped view of rank <= 4 in which
// reduced and kept dims strictly alternate. reduce_first_axis_ records
// whether dim 0 of that reshaped view is a reduced one; the ReduceNDims
// helpers then handle each possible rank.
template<typename T>
class ReduceOp : public ReduceOpBase {
 public:
  // Caches input/output pointers and the input shape, then lets the base
  // class parse reduce_type / axis / keepdims.
  MaceStatus OnInit() {
    input_ = GetInputData<T>(INPUT);
    input_dims_ = GetInputShapeDims(INPUT);
    input_dim_size_ = GetInputShapeDimSize(INPUT);
    output_ = GetOutputData<T>(OUTPUT);
    return ReduceOpBase::OnInit();
  }
  // Validates the axes, computes the simplified shape and the output
  // shape, resizes the output, and runs the reduction.
  MaceStatus Run() {
    Validate();
    ScratchBuffer scratch_buffer(engine_config_);
    // Per-input-dim flag: true iff that dimension is reduced.
    bool *bitmap = scratch_buffer.GetBuffer<bool>(input_dim_size_);
    int32_t *data_dims = scratch_buffer.GetBuffer<int32_t>(input_dim_size_);
    uint32_t data_dim_size = 0;
    int32_t *output_dims = scratch_buffer.GetBuffer<int32_t>(input_dim_size_);
    uint32_t output_dim_size = 0;
    Simplify(output_dims, &output_dim_size, bitmap,
             input_dim_size_, data_dims, &data_dim_size);
    MACE_RETURN_IF_ERROR(
        ResizeOutputShape(OUTPUT, output_dim_size, output_dims));
    const int32_t output_size =
        base::GetShapeSize(output_dim_size, output_dims);
    Compute(data_dims, data_dim_size, static_cast<uint32_t >(output_size));
    return MACE_SUCCESS;
  }
 private:
  // Builds the reduced-axis bitmap, derives the output shape, and merges
  // adjacent same-kind input dims into data_dims -- the simplified shape
  // the ReduceNDims kernels operate on.
  void Simplify(int32_t *output_dims, uint32_t *output_dim_size,
                bool *bitmap, int32_t bitmap_size,
                int32_t *data_dims, uint32_t *data_dim_size) {
    base::memset(bitmap, false, bitmap_size);
    if (axis_size_ == 0) {
      // No axes given: reduce over every dimension.
      for (uint32_t i = 0; i < input_dim_size_; ++i) {
        bitmap[i] = true;
      }
    } else {
      for (uint32_t i = 0; i < axis_size_; ++i) {
        // Negative axes count from the end.
        int32_t index = axis_[i] >= 0 ? axis_[i] : axis_[i] + input_dim_size_;
        // NOTE(review): for 4-D non-quantized NCHW inputs the axis is
        // remapped (1->2, 2->3, 3->1), i.e. the axis values appear to be
        // specified in NHWC terms and translated to NCHW positions --
        // confirm against the model converter.
        DataFormat data_format = static_cast<DataFormat>(GetArgByName(
            "data_format", static_cast<int32_t >(NHWC)));
        if (data_format == NCHW &&
            DataTypeToEnum<T>::value != DT_UINT8 && input_dim_size_ == 4) {
          if (index == 1 || index == 2) {
            index = index + 1;
          } else if (index == 3) {
            index = 1;
          }
        }
        bitmap[index] = true;
      }
    }
    // Output shape: kept dims pass through; reduced dims appear as 1
    // only when keepdims was requested.
    uint32_t out_dim_idx = 0;
    for (uint32_t i = 0; i < input_dim_size_; ++i) {
      if (!bitmap[i]) {
        output_dims[out_dim_idx++] = input_dims_[i];
      } else if (keep_dims_) {
        output_dims[out_dim_idx++] = 1;
      }
    }
    *output_dim_size = out_dim_idx;
    int32_t data_dims_idx = 0;
    // Skip leading size-1 dims; they carry no data.
    uint32_t dim_index = 0;
    for (; dim_index < input_dim_size_; ++dim_index) {
      if (input_dims_[dim_index] != 1) break;
    }
    if (dim_index >= input_dim_size_) {
      // All dims are 1 (a single element); treat as "reduce first axis".
      reduce_first_axis_ = true;
    } else {
      reduce_first_axis_ = bitmap[dim_index];
      data_dims[data_dims_idx++] = input_dims_[dim_index];
      ++dim_index;
      // Merge runs of dims that are all reduced or all kept; size-1 dims
      // adopt the kind of their predecessor so they merge freely.
      for (; dim_index < input_dim_size_; ++dim_index) {
        const int32_t n = input_dims_[dim_index];
        if (n == 1) {
          bitmap[dim_index] = bitmap[dim_index - 1];
        }
        if (bitmap[dim_index - 1] != bitmap[dim_index]) {
          data_dims[data_dims_idx++] = n;
        } else {
          data_dims[data_dims_idx - 1] *= n;
        }
      }
    }
    *data_dim_size = data_dims_idx;
  }
  // Rank-1 simplified shape: either the whole tensor reduces to a single
  // scalar (reduce_first_axis_) or nothing reduces and the data is copied
  // through unchanged.
  void Reduce1Dims(ReduceType type, int32_t *data_reshape) {
    if (reduce_first_axis_) {
      if (type == MEAN) {
        // NOTE: integer T truncates the mean, as usual for this kernel.
        T tmp = 0;
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          tmp = tmp + input_[i];
        }
        output_[0] = tmp / data_reshape[0];
      } else if (type == MIN) {
        T tmp = input_[0];
        for (int32_t i = 1; i < data_reshape[0]; ++i) {
          tmp = base::min<T>(tmp, input_[i]);
        }
        output_[0] = tmp;
      } else if (type == MAX) {
        T tmp = input_[0];
        for (int32_t i = 1; i < data_reshape[0]; ++i) {
          tmp = base::max<T>(tmp, input_[i]);
        }
        output_[0] = tmp;
      } else if (type == PROD) {
        T tmp = input_[0];
        for (int32_t i = 1; i < data_reshape[0]; ++i) {
          tmp = tmp * input_[i];
        }
        output_[0] = tmp;
      } else if (type == SUM) {
        T tmp = 0;
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          tmp = tmp + input_[i];
        }
        output_[0] = tmp;
      } else {
        MACE_NOT_IMPLEMENTED;
      }
    } else {
      base::memcpy(output_, input_, data_reshape[0] * sizeof(T));
    }
  }
  // Rank-2 simplified shape: reduce axis 0 (reduce_first_axis_) or axis 1.
  void Reduce2Dims(ReduceType type, int32_t *data_reshape) {
    if (reduce_first_axis_) {
      if (type == MEAN) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          T tmp = 0;
          for (int32_t j = 0; j < data_reshape[0]; ++j) {
            tmp += input_[j * data_reshape[1] + i];
          }
          output_[i] = tmp / data_reshape[0];
        }
      } else if (type == MIN) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          T tmp = input_[i];
          for (int32_t j = 1; j < data_reshape[0]; ++j) {
            tmp = base::min(tmp, input_[j * data_reshape[1] + i]);
          }
          output_[i] = tmp;
        }
      } else if (type == MAX) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          T tmp = input_[i];
          for (int32_t j = 1; j < data_reshape[0]; ++j) {
            tmp = base::max(tmp, input_[j * data_reshape[1] + i]);
          }
          output_[i] = tmp;
        }
      } else if (type == PROD) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          T tmp = input_[i];
          for (int32_t j = 1; j < data_reshape[0]; ++j) {
            tmp = tmp * input_[j * data_reshape[1] + i];
          }
          output_[i] = tmp;
        }
      } else if (type == SUM) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          T tmp = 0;
          for (int32_t j = 0; j < data_reshape[0]; ++j) {
            tmp += input_[j * data_reshape[1] + i];
          }
          output_[i] = tmp;
        }
      } else {
        MACE_NOT_IMPLEMENTED;
      }
    } else {
      if (type == MEAN) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          T tmp = 0;
          for (int32_t j = 0; j < data_reshape[1]; ++j) {
            tmp += input_[i * data_reshape[1] + j];
          }
          output_[i] = tmp / data_reshape[1];
        }
      } else if (type == MIN) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          T tmp = input_[i * data_reshape[1]];
          for (int32_t j = 1; j < data_reshape[1]; ++j) {
            tmp = base::min(tmp, input_[i * data_reshape[1] + j]);
          }
          output_[i] = tmp;
        }
      } else if (type == MAX) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          T tmp = input_[i * data_reshape[1]];
          for (int32_t j = 1; j < data_reshape[1]; ++j) {
            tmp = base::max(tmp, input_[i * data_reshape[1] + j]);
          }
          output_[i] = tmp;
        }
      } else if (type == PROD) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          T tmp = input_[i * data_reshape[1]];
          for (int32_t j = 1; j < data_reshape[1]; ++j) {
            tmp = tmp * input_[i * data_reshape[1] + j];
          }
          output_[i] = tmp;
        }
      } else if (type == SUM) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          T tmp = 0;
          for (int32_t j = 0; j < data_reshape[1]; ++j) {
            tmp += input_[i * data_reshape[1] + j];
          }
          output_[i] = tmp;
        }
      } else {
        MACE_NOT_IMPLEMENTED;
      }
    }
  }
  // Rank-3 simplified shape: reduce axes {0,2} (reduce_first_axis_) or
  // axis {1}. MEAN/SUM rely on Compute() having zeroed the output.
  void Reduce3Dims(ReduceType type, int32_t *data_reshape) {
    if (reduce_first_axis_) {
      if (type == MEAN) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            for (int32_t k = 0; k < data_reshape[0]; ++k) {
              output_[i] +=
                  input_[(k * data_reshape[1] + i) * data_reshape[2]
                      + j];
            }
          }
          output_[i] /= (data_reshape[0] * data_reshape[2]);
        }
      } else if (type == MIN) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          T tmp = input_[i * data_reshape[2]];
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            for (int32_t k = 0; k < data_reshape[0]; ++k) {
              tmp = base::min(
                  tmp, input_[(k * data_reshape[1] + i) * data_reshape[2] + j]);
            }
          }
          output_[i] = tmp;
        }
      } else if (type == MAX) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          T tmp = input_[i * data_reshape[2]];
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            for (int32_t k = 0; k < data_reshape[0]; ++k) {
              tmp = base::max(
                  tmp, input_[(k * data_reshape[1] + i) * data_reshape[2] + j]);
            }
          }
          output_[i] = tmp;
        }
      } else if (type == PROD) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          T tmp = 1;
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            for (int32_t k = 0; k < data_reshape[0]; ++k) {
              tmp *= input_[(k * data_reshape[1] + i) * data_reshape[2] + j];
            }
          }
          output_[i] = tmp;
        }
      } else if (type == SUM) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            for (int32_t k = 0; k < data_reshape[0]; ++k) {
              output_[i] +=
                  input_[(k * data_reshape[1] + i) * data_reshape[2] + j];
            }
          }
        }
      } else {
        MACE_NOT_IMPLEMENTED;
      }
    } else {
      if (type == MEAN) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            for (int32_t k = 0; k < data_reshape[1]; ++k) {
              output_[i * data_reshape[2] + j] +=
                  input_[(i * data_reshape[1] + k) * data_reshape[2] + j];
            }
            output_[i * data_reshape[2] + j] /= data_reshape[1];
          }
        }
      } else if (type == MIN) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            T tmp = input_[i * data_reshape[1] * data_reshape[2] + j];
            for (int32_t k = 1; k < data_reshape[1]; ++k) {
              tmp = base::min(
                  tmp, input_[(i * data_reshape[1] + k) * data_reshape[2] + j]);
            }
            output_[i * data_reshape[2] + j] = tmp;
          }
        }
      } else if (type == MAX) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            T tmp = input_[i * data_reshape[1] * data_reshape[2] + j];
            for (int32_t k = 1; k < data_reshape[1]; ++k) {
              tmp = base::max(
                  tmp, input_[(i * data_reshape[1] + k) * data_reshape[2] + j]);
            }
            output_[i * data_reshape[2] + j] = tmp;
          }
        }
      } else if (type == PROD) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            T tmp = input_[i * data_reshape[1] * data_reshape[2] + j];
            for (int32_t k = 1; k < data_reshape[1]; ++k) {
              tmp *= input_[(i * data_reshape[1] + k) * data_reshape[2] + j];
            }
            output_[i * data_reshape[2] + j] = tmp;
          }
        }
      } else if (type == SUM) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            for (int32_t k = 0; k < data_reshape[1]; ++k) {
              output_[i * data_reshape[2] + j] +=
                  input_[(i * data_reshape[1] + k) * data_reshape[2] + j];
            }
          }
        }
      } else {
        MACE_NOT_IMPLEMENTED;
      }
    }
  }
  // Rank-4 simplified shape: reduce axes {0,2} (reduce_first_axis_) or
  // axes {1,3}. MEAN/SUM rely on Compute() having zeroed the output.
  void Reduce4Dims(ReduceType type, int32_t *data_reshape) {
    if (reduce_first_axis_) {
      if (type == MEAN) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          for (int32_t j = 0; j < data_reshape[3]; ++j) {
            for (int32_t k = 0; k < data_reshape[2]; ++k) {
              for (int32_t t = 0; t < data_reshape[0]; ++t) {
                output_[i * data_reshape[3] + j] +=
                    input_[((t * data_reshape[1] + i) *
                        data_reshape[2] + k) * data_reshape[3] + j];
              }
            }
            output_[i * data_reshape[3] + j] /=
                (data_reshape[0] * data_reshape[2]);
          }
        }
      } else if (type == MIN) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          for (int32_t j = 0; j < data_reshape[3]; ++j) {
            T tmp = input_[i * data_reshape[2] * data_reshape[3] + j];
            for (int32_t k = 0; k < data_reshape[2]; ++k) {
              for (int32_t t = 0; t < data_reshape[0]; ++t) {
                tmp = base::min(tmp,
                                input_[((t * data_reshape[1] + i) *
                                    data_reshape[2] + k) * data_reshape[3]
                                    + j]);
              }
            }
            output_[i * data_reshape[3] + j] = tmp;
          }
        }
      } else if (type == MAX) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          for (int32_t j = 0; j < data_reshape[3]; ++j) {
            T tmp = input_[i * data_reshape[2] * data_reshape[3] + j];
            for (int32_t k = 0; k < data_reshape[2]; ++k) {
              for (int32_t t = 0; t < data_reshape[0]; ++t) {
                tmp = base::max(tmp,  // NOLINT
                                input_[((t * data_reshape[1] + i) *
                                    data_reshape[2] + k) * data_reshape[3]
                                    + j]);
              }
            }
            output_[i * data_reshape[3] + j] = tmp;
          }
        }
      } else if (type == PROD) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          for (int32_t j = 0; j < data_reshape[3]; ++j) {
            T tmp = 1;
            for (int32_t k = 0; k < data_reshape[2]; ++k) {
              for (int32_t t = 0; t < data_reshape[0]; ++t) {
                tmp = tmp * input_[((t * data_reshape[1] + i) *
                    data_reshape[2] + k) * data_reshape[3] + j];
              }
            }
            output_[i * data_reshape[3] + j] = tmp;
          }
        }
      } else if (type == SUM) {
        for (int32_t i = 0; i < data_reshape[1]; ++i) {
          for (int32_t j = 0; j < data_reshape[3]; ++j) {
            for (int32_t k = 0; k < data_reshape[2]; ++k) {
              for (int32_t t = 0; t < data_reshape[0]; ++t) {
                output_[i * data_reshape[3] + j] +=
                    input_[((t * data_reshape[1] + i) *
                        data_reshape[2] + k) * data_reshape[3] + j];
              }
            }
          }
        }
      } else {
        MACE_NOT_IMPLEMENTED;
      }
    } else {
      if (type == MEAN) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            for (int32_t k = 0; k < data_reshape[1]; ++k) {
              for (int32_t t = 0; t < data_reshape[3]; ++t) {
                output_[i * data_reshape[2] + j] +=
                    input_[((i * data_reshape[1] + k) *
                        data_reshape[2] + j) * data_reshape[3] + t];
              }
            }
            output_[i * data_reshape[2] + j] /=
                (data_reshape[1] * data_reshape[3]);
          }
        }
      } else if (type == MIN) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            T tmp = input_[(i * data_reshape[1] *
                data_reshape[2] + j) * data_reshape[3]];
            for (int32_t k = 0; k < data_reshape[1]; ++k) {
              for (int32_t t = 0; t < data_reshape[3]; ++t) {
                tmp = base::min(
                    tmp, input_[((i * data_reshape[1] + k) *
                        data_reshape[2] + j) * data_reshape[3] + t]);
              }
            }
            output_[i * data_reshape[2] + j] = tmp;
          }
        }
      } else if (type == MAX) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            T tmp = input_[(i * data_reshape[1] *
                data_reshape[2] + j) * data_reshape[3]];
            for (int32_t k = 0; k < data_reshape[1]; ++k) {
              for (int32_t t = 0; t < data_reshape[3]; ++t) {
                tmp = base::max(
                    tmp, input_[((i * data_reshape[1] + k) *
                        data_reshape[2] + j) * data_reshape[3] + t]);
              }
            }
            output_[i * data_reshape[2] + j] = tmp;
          }
        }
      } else if (type == PROD) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            T tmp = 1;
            for (int32_t k = 0; k < data_reshape[1]; ++k) {
              for (int32_t t = 0; t < data_reshape[3]; ++t) {
                tmp = tmp * input_[((i * data_reshape[1] + k) *
                    data_reshape[2] + j) * data_reshape[3] + t];
              }
            }
            output_[i * data_reshape[2] + j] = tmp;
          }
        }
      } else if (type == SUM) {
        for (int32_t i = 0; i < data_reshape[0]; ++i) {
          for (int32_t j = 0; j < data_reshape[2]; ++j) {
            for (int32_t k = 0; k < data_reshape[1]; ++k) {
              for (int32_t t = 0; t < data_reshape[3]; ++t) {
                output_[i * data_reshape[2] + j] +=
                    input_[((i * data_reshape[1] + k) *
                        data_reshape[2] + j) * data_reshape[3] + t];
              }
            }
          }
        }
      } else {
        MACE_NOT_IMPLEMENTED;
      }
    }
  }
  // Zeroes the output (the MEAN/SUM kernels accumulate into it), then
  // dispatches on the simplified rank.
  void Compute(int32_t *data_reshape,
               uint32_t data_reshape_size, uint32_t output_size) {
    base::memset(output_, static_cast<T>(0), output_size);
    switch (data_reshape_size) {
      case 1:Reduce1Dims(reduce_type_, data_reshape);
        break;
      case 2:Reduce2Dims(reduce_type_, data_reshape);
        break;
      case 3:Reduce3Dims(reduce_type_, data_reshape);
        break;
      case 4:Reduce4Dims(reduce_type_, data_reshape);
        break;
      default:LOG(FATAL) << "not implemented in mace"
                         << "data reshape size" << data_reshape_size
                         << "reduce first axis:" << reduce_first_axis_;
        break;
    }
  }
 private:
  const T *input_;             // Source elements.
  const int32_t *input_dims_;  // Source shape.
  uint32_t input_dim_size_;    // Source rank.
  T *output_;                  // Destination elements.
  bool reduce_first_axis_;     // True iff reshaped dim 0 is reduced.
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_REDUCE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/reshape.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
namespace {
// Resolves the special entries of a requested reshape target in place:
// a single -1 entry is inferred from the input's element count, and a 0
// entry copies the corresponding input dimension. Asserts (debug builds)
// that the resolved shape is consistent with the input size.
MaceStatus ValidShapeData(const int32_t *input_dims,
                          const uint32_t input_dim_size,
                          int32_t *shape_data,
                          const uint32_t shape_data_size) {
  MACE_ASSERT(
      input_dims != NULL && shape_data != NULL);
  const int32_t input_size = base::GetShapeSize(input_dim_size, input_dims);
  int32_t inferred_axis = -1;
  int32_t known_product = 1;
  for (uint32_t i = 0; i < shape_data_size; ++i) {
    const int32_t dim = shape_data[i];
    if (dim == -1) {
      MACE_ASSERT1(inferred_axis == -1, "Only one input size may be -1");
      inferred_axis = i;
      shape_data[i] = 1;  // Placeholder; overwritten below.
      continue;
    }
    MACE_ASSERT2(dim >= 0, "Shape must be non-negative: ", dim);
    if (dim == 0) {
      // 0 means "copy the matching input dimension".
      MACE_ASSERT1(i < input_dim_size, "dims:0 out of input dims' range.");
      shape_data[i] = input_dims[i];
    }
    known_product *= shape_data[i];
  }
  if (inferred_axis != -1) {
    MACE_ASSERT1(known_product != 0,
                 "Cannot infer shape if there is zero shape size.");
    const int32_t missing = input_size / known_product;
    MACE_ASSERT1(missing * known_product == input_size,
                 "Input size not match reshaped tensor size");
    shape_data[inferred_axis] = missing;
  }
  return MACE_SUCCESS;
}
} // namespace
// Caches pointers for both inputs and the output. INPUT holds the data
// to pass through unchanged; SHAPE is an int32 tensor with the requested
// dims (one -1 entry and 0 entries are resolved later, in Run()).
MaceStatus ReshapeOp::OnInit() {
  input_ = GetInputData<mifloat>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  shape_ = GetInputData<int32_t>(SHAPE);
  shape_dims_ = GetInputShapeDims(SHAPE);
  shape_dim_size_ = GetInputShapeDimSize(SHAPE);
  output_ = GetOutputData<mifloat>(OUTPUT);
  return MACE_SUCCESS;
}
// Copies the input data to the output and resizes the output to the
// shape requested by the SHAPE tensor (after resolving -1/0 entries).
MaceStatus ReshapeOp::Run() {
  const int32_t num_elements =
      base::GetShapeSize(input_dim_size_, input_dims_);
  const int32_t out_rank =
      base::GetShapeSize(shape_dim_size_, shape_dims_);
  // Work on a mutable copy of the requested shape so that -1/0 entries
  // can be resolved in place.
  int32_t *out_shape =
      ScratchBuffer(engine_config_).GetBuffer<int32_t>(out_rank);
  base::memcpy(out_shape, shape_, out_rank * sizeof(int32_t));
  MACE_RETURN_IF_ERROR(ValidShapeData(input_dims_, input_dim_size_,
                                      out_shape, out_rank));
#ifndef NDEBUG
  // Debug-only consistency check: element counts must match.
  const int32_t output_data_size = base::accumulate_multi(
      out_shape, 0, static_cast<uint32_t>(out_rank));
  if (num_elements != output_data_size) {
    LOG(FATAL) << "input_data_size(" << num_elements
               << ") != output_data_size(" << output_data_size
               << "), please check the model.";
  }
#endif
  // TODO(luxuhui): optimize this method by reusing buffer
  base::memcpy(output_, input_, num_elements * sizeof(mifloat));
  return ResizeOutputShape(OUTPUT, out_rank, out_shape);
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_RESHAPE_H_
#define MICRO_OPS_RESHAPE_H_
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Reshapes INPUT to the dimensions given by the SHAPE tensor. The data
// is copied through unchanged; only the shape metadata differs.
class ReshapeOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();
 private:
  const mifloat *input_;       // Source element data.
  const int32_t *input_dims_;  // Source shape.
  uint32_t input_dim_size_;    // Source rank.
  const int32_t *shape_;       // Requested target dims (may hold -1/0).
  const int32_t *shape_dims_;  // Shape of the SHAPE tensor.
  uint32_t shape_dim_size_;    // Rank of the SHAPE tensor.
  mifloat *output_;            // Destination element data.
  MACE_OP_INPUT_TAGS(INPUT, SHAPE);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_RESHAPE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/shape.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
namespace micro {
namespace ops {
// Caches the input's shape metadata and the int32 output buffer; the
// input's element data itself is never read by this op.
MaceStatus ShapeOp::OnInit() {
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  output_ = GetOutputData<int32_t>(OUTPUT);
  return MACE_SUCCESS;
}
// Writes the input tensor's dimensions into the output as int32 values.
// The output is resized to a 1-D tensor of length rank (rank-0 input
// yields a rank-0 output and the copy loop below writes nothing).
MaceStatus ShapeOp::Run() {
  if (input_dim_size_ > 0) {
    const int32_t out_put_dims[1] = {static_cast<int32_t>(input_dim_size_)};
    MACE_RETURN_IF_ERROR(ResizeOutputShape(OUTPUT, 1, out_put_dims));
  } else {
    // Fix: this resize's status was previously ignored, unlike the
    // branch above; propagate failures to the caller.
    MACE_RETURN_IF_ERROR(ResizeOutputShape(OUTPUT, 0, NULL));
  }
  for (uint32_t i = 0; i < input_dim_size_; ++i) {
    output_[i] = static_cast<int32_t>(input_dims_[i]);
  }
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_SHAPE_H_
#define MICRO_OPS_SHAPE_H_
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Shape op: emits the input tensor's dimensions as a 1-D int32 tensor.
class ShapeOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();
 private:
  const int32_t *input_dims_;  // borrowed pointer to the input's dims
  uint32_t input_dim_size_;    // rank of the input tensor
  int32_t *output_;            // output buffer receiving the dims
  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_SHAPE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/softmax.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
namespace micro {
namespace ops {
// Reads op arguments and caches input/output pointers for Run().
// "data_format" defaults to NHWC; "use_log" selects log-softmax output.
MaceStatus SoftmaxOp::OnInit() {
  data_format_ = static_cast<DataFormat>(GetArgByName(
      "data_format", static_cast<int32_t>(NHWC)));
  input_ = GetInputData<mifloat>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  MACE_ASSERT1(input_dim_size_ >= 2, "The input->dim_size() >= 2 failed.");
  output_ = GetOutputData<mifloat>(OUTPUT);
  use_log_ = GetArgByName("use_log", false);
  return MACE_SUCCESS;
}
// Softmax preserves the input shape; only NHWC layout is implemented.
MaceStatus SoftmaxOp::Run() {
  MACE_RETURN_IF_ERROR(ResizeOutputShape(OUTPUT, input_dim_size_, input_dims_));
  if (NHWC == data_format_) {  // only NHWC is supported
    return RunForNHWC();
  } else {
    MACE_NOT_IMPLEMENTED;
    return MACE_UNSUPPORTED;
  }
}
// Softmax over the last (class/channel) dimension of an NHWC tensor:
// exp(x - max) / sum(exp(x - max)) per position; when use_log_ is set,
// the log of that normalized value is written instead.
MaceStatus SoftmaxOp::RunForNHWC() {
  const int32_t num_classes = input_dims_[input_dim_size_ - 1];
  const int32_t class_stride = num_classes;
  // Elements in one batch entry: product of dims[1..rank).
  const int32_t batch_elems =
      base::accumulate_multi(input_dims_, 1, input_dim_size_);
  // Total element count of the whole tensor.
  const int32_t total_elems = base::GetShapeSize(input_dim_size_, input_dims_);
  const float min_float = base::lowest();
  for (int32_t batch_base = 0; batch_base < total_elems;
       batch_base += batch_elems) {
    const mifloat *batch_in = input_ + batch_base;
    mifloat *batch_out = output_ + batch_base;
    for (int32_t pos = 0; pos < batch_elems; pos += class_stride) {
      const mifloat *in_ptr = batch_in + pos;
      mifloat *out_ptr = batch_out + pos;
      // Subtract the per-position max before exp() for numerical stability.
      float max_val = min_float;
      for (int32_t c = 0; c < num_classes; ++c) {
        max_val = base::max<float>(max_val, in_ptr[c]);  // NOLINT
      }
      float sum = 0;
      for (int32_t c = 0; c < num_classes; ++c) {
        const float exp_value = base::exp(in_ptr[c] - max_val);
        sum += exp_value;
        out_ptr[c] = exp_value;
      }
      if (use_log_) {
        for (int32_t c = 0; c < num_classes; ++c) {
          float normalized = out_ptr[c];
          normalized /= sum;
          out_ptr[c] = base::log(normalized);
        }
      } else {
        for (int32_t c = 0; c < num_classes; ++c) {
          out_ptr[c] = out_ptr[c] / sum;
        }
      }
    }  // pos
  }  // batch_base
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_SOFTMAX_H_
#define MICRO_OPS_SOFTMAX_H_
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Softmax op over the innermost dimension. Supports NHWC layout only;
// an optional "use_log" argument produces log-softmax.
class SoftmaxOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();
 private:
  // Computes softmax for NHWC input (class dim is the innermost dim).
  MaceStatus RunForNHWC();
 private:
  const mifloat *input_;       // borrowed input buffer
  const int32_t *input_dims_;  // borrowed input dims
  uint32_t input_dim_size_;    // input rank (asserted >= 2)
  mifloat *output_;            // output buffer (same shape as input)
  bool use_log_;               // true -> log-softmax
  DataFormat data_format_;     // from "data_format" arg, default NHWC
  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_SOFTMAX_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/squeeze.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
// Reads the "axis" argument, remaps NHWC-style axes {1, 2} to NCHW's
// {2, 3} when needed, and allocates the scratch buffer Run() uses to
// build the squeezed output shape.
MaceStatus SqueezeOp::OnInit() {
  input_ = GetInputData<mifloat>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  MACE_ASSERT1(input_dim_size_ >= 2, "The input->dim_size() >= 2 failed.");
  output_ = GetOutputData<mifloat>(OUTPUT);
  const int32_t *axis = GetRepeatArgByName<int32_t>("axis", &axis_size_);
  data_format_ = static_cast<DataFormat>(GetArgByName(
      "data_format", static_cast<int32_t>(NHWC)));
  ScratchBuffer scratch_buffer(engine_config_);
  if (data_format_ == NCHW && input_dim_size_ == 4
      && axis_size_ == 2 && axis[0] == 1 && axis[1] == 2) {
    // The model stores NHWC spatial axes {1, 2}; rewrite them to the
    // NCHW equivalents {2, 3} in a scratch copy so the original
    // argument data stays untouched. (The previous code also memcpy'd
    // `axis` into the scratch buffer first, but both elements were
    // immediately overwritten, so that copy was dead work.)
    axis_ = scratch_buffer.GetBuffer<int32_t>(axis_size_);
    axis_[0] = 2;
    axis_[1] = 3;
  } else {
    axis_ = const_cast<int32_t *>(axis);
  }
  resize_shape_ = scratch_buffer.GetBuffer<int32_t>(input_dim_size_);
  return MACE_SUCCESS;
}
// Copies the input data unchanged and shrinks the reported shape:
// size-1 dims are dropped when the axis list is empty, or when they are
// listed in axis_; dims larger than 1 are always kept.
MaceStatus SqueezeOp::Run() {
  int32_t resize_shape_idx = 0;
  for (uint32_t i = 0; i < input_dim_size_; ++i) {
    if (input_dims_[i] > 1) {
      resize_shape_[resize_shape_idx++] = input_dims_[i];
    } else if (axis_size_ > 0) {
      // Size-1 dim: keep it only if it is NOT one of the squeeze axes.
      bool exist_in_axis = false;
      for (uint32_t k = 0; k < axis_size_; ++k) {
        if (i == static_cast<uint32_t>(axis_[k])) {
          exist_in_axis = true;
          break;
        }
      }
      if (!exist_in_axis) {
        resize_shape_[resize_shape_idx++] = input_dims_[i];
      }
    }
    // else: empty axis list -> every size-1 dim is squeezed.
  }
  // TODO(luxuhui): optimize this method by reusing buffer
  const int32_t input_size = base::GetShapeSize(input_dim_size_, input_dims_);
  base::memcpy(output_, input_, input_size * sizeof(mifloat));
  return ResizeOutputShape(OUTPUT, resize_shape_idx, resize_shape_);
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_SQUEEZE_H_
#define MICRO_OPS_SQUEEZE_H_
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Squeeze op: removes size-1 dimensions from the input shape (all of
// them, or only those listed in the "axis" argument). Data is copied
// through unchanged.
class SqueezeOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();
 private:
  const mifloat *input_;       // borrowed input buffer
  const int32_t *input_dims_;  // borrowed input dims
  uint32_t input_dim_size_;    // input rank (asserted >= 2)
  mifloat *output_;            // output buffer
  int32_t *axis_;              // squeeze axes (possibly NCHW-remapped)
  uint32_t axis_size_;         // number of entries in axis_
  int32_t *resize_shape_;      // scratch buffer for the output shape
  DataFormat data_format_;     // from "data_format" arg, default NHWC
  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_SQUEEZE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_STACK_H_
#define MICRO_OPS_STACK_H_
#include "micro/base/utils.h"
#include "micro/framework/operator.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
// Stack op: packs N input tensors along a new `axis` dimension,
// producing an output of rank (input rank + 1).
// NOTE(review): output shape and copy sizes are derived from the FIRST
// input only; all inputs are assumed to share that shape — not
// validated here.
template<typename T>
class StackOp : public framework::Operator {
 public:
  MaceStatus OnInit() {
    input_dims_ = GetInputShapeDims(INPUT);
    input_dim_size_ = GetInputShapeDimSize(INPUT);
    output_ = GetOutputData<T>(OUTPUT);
    axis_ = GetArgByName("axis", static_cast<int32_t>(0));
    // A negative axis counts from the back of the OUTPUT shape.
    const int32_t output_dim_size = static_cast<int32_t>(input_dim_size_) + 1;
    MACE_ASSERT1(axis_ >= -output_dim_size && axis_ < output_dim_size,
                 "axis out of bound.");
    if (axis_ < 0) {
      axis_ += output_dim_size;
    }
    return MACE_SUCCESS;
  }
  MaceStatus Run() {
    const uint32_t inputs_size = GetInputSize();
    MACE_ASSERT1(inputs_size > 0, "stack inputs are empty.");
    // Output shape: the input shape with `inputs_size` inserted at axis_.
    int32_t output_dim_size = static_cast<int32_t>(input_dim_size_) + 1;
    int32_t *output_dims =
        ScratchBuffer(engine_config_).GetBuffer<int32_t>(output_dim_size);
    for (int32_t i = 0; i < output_dim_size; ++i) {
      if (i < axis_) {
        output_dims[i] = input_dims_[i];
      } else if (i == axis_) {
        output_dims[i] = inputs_size;
      } else {
        output_dims[i] = input_dims_[i - 1];
      }
    }
    // Propagate resize failures; this status was previously dropped.
    MACE_RETURN_IF_ERROR(
        ResizeOutputShape(OUTPUT, output_dim_size, output_dims));
    // Interleave: for each leading index h, copy one trailing slice
    // from every input in turn.
    int32_t high_dim_elem_size = base::accumulate_multi(input_dims_, 0, axis_);
    int32_t low_dim_elem_size =
        base::accumulate_multi(input_dims_, axis_, input_dim_size_);
    T *output_data = output_;
    for (int32_t h = 0; h < high_dim_elem_size; ++h) {
      for (uint32_t i = 0; i < inputs_size; ++i) {
        const T *input_data = GetInputData<T>(i);
        base::memcpy(output_data, input_data + h * low_dim_elem_size,
                     sizeof(T) * low_dim_elem_size);
        output_data += low_dim_elem_size;
      }
    }
    return MACE_SUCCESS;
  }
 private:
  const int32_t *input_dims_;  // dims of the FIRST input
  uint32_t input_dim_size_;    // rank of the FIRST input
  T *output_;                  // output buffer
  int32_t axis_;               // normalized (non-negative) stack axis
  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_STACK_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_STRIDED_SLICE_H_
#define MICRO_OPS_STRIDED_SLICE_H_
#include "micro/base/utils.h"
#include "micro/framework/operator.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/include/utils/macros.h"
namespace micro {
namespace ops {
// StridedSlice / Slice op with TensorFlow-style semantics: extracts a
// sub-tensor described by the BEGIN/END/STRIDES input tensors plus the
// begin_mask / end_mask / shrink_axis_mask arguments. Supports input
// ranks 1 through 4. ellipsis_mask and new_axis_mask are rejected in
// InitPrams(). When the "slice" argument is true, BEGIN/END are
// interpreted as Slice-style (begin, size) pairs instead.
template<typename T>
class StridedSliceOp : public framework::Operator {
 public:
  MaceStatus OnInit() {
    MACE_RETURN_IF_ERROR(InitPrams());
    return MACE_SUCCESS;
  }
  MaceStatus Run() {
    MACE_RETURN_IF_ERROR(AdjustPrams());
    MACE_RETURN_IF_ERROR(
        ResizeOutputShape(OUTPUT, output_shape_idx_, output_shape_));
    // Fast path check: every stride is 1 and only the first axis is
    // clipped, so the slice is one contiguous range of memory.
    bool slice_by_first_axis = true;
    if (strides_[0] != 1) {
      slice_by_first_axis = false;
    } else {
      for (uint32_t d = 1; d < input_dim_size_; ++d) {
        if (strides_[d] != 1 || begin_[d] != 0 ||
            end_[d] != input_dims_[d]) {
          slice_by_first_axis = false;
          break;
        }
      }
    }
    if (slice_by_first_axis) {
      // dim_stride_[d] = element count spanned by one step of dim d.
      base::memset(dim_stride_, static_cast<int32_t>(1), input_dim_size_);
      for (int32_t d = input_dim_size_ - 2; d >= 0; --d) {
        dim_stride_[d] = dim_stride_[d + 1] * input_dims_[d + 1];
      }
      // Single contiguous copy of rows begin_[0]..end_[0).
      base::memcpy(output_, input_ + begin_[0] * dim_stride_[0],
                   sizeof(T) * (end_[0] - begin_[0]) * dim_stride_[0]);
    } else {
      // General path: one explicit loop nest per supported rank; the
      // comparison direction follows the sign of each stride, so
      // negative strides walk backwards from begin_ toward end_.
      if (input_dim_size_ == 1) {
        for (int32_t i = begin_[0];
             strides_[0] > 0 ? i < end_[0] : i > end_[0]; i += strides_[0]) {
          *output_++ = input_[i];
        }
      } else if (input_dim_size_ == 2) {
        for (int32_t i = begin_[0];
             strides_[0] > 0 ? i < end_[0] : i > end_[0]; i += strides_[0]) {
          for (int32_t j = begin_[1];
               strides_[1] > 0 ? j < end_[1] : j > end_[1]; j += strides_[1]) {
            *output_++ = input_[i * input_dims_[1] + j];
          }
        }
      } else if (input_dim_size_ == 3) {
        for (int32_t i = begin_[0];
             strides_[0] > 0 ? i < end_[0] : i > end_[0]; i += strides_[0]) {
          for (int32_t j = begin_[1];
               strides_[1] > 0 ? j < end_[1] : j > end_[1]; j += strides_[1]) {
            for (int32_t k = begin_[2];
                 strides_[2] > 0 ? k < end_[2] : k > end_[2];
                 k += strides_[2]) {
              *output_++ =
                  input_[(i * input_dims_[1] + j) * input_dims_[2] + k];
            }
          }
        }
      } else if (input_dim_size_ == 4) {
        for (int32_t i = begin_[0];
             strides_[0] > 0 ? i < end_[0] : i > end_[0]; i += strides_[0]) {
          for (int32_t j = begin_[1];
               strides_[1] > 0 ? j < end_[1] : j > end_[1]; j += strides_[1]) {
            for (int32_t k = begin_[2];
                 strides_[2] > 0 ? k < end_[2] : k > end_[2];
                 k += strides_[2]) {
              for (int32_t l = begin_[3];
                   strides_[3] > 0 ? l < end_[3] : l > end_[3];
                   l += strides_[3]) {
                int32_t input_base =
                    (i * input_dims_[1] + j) * input_dims_[2] + k;
                int32_t input_idx = input_base * input_dims_[3] + l;
                *output_++ = input_[input_idx];
              }
            }
          }
        }
      } else {
        MACE_NOT_IMPLEMENTED;
      }
    }
    return MACE_SUCCESS;
  }
 private:
  // Allocates per-dim scratch arrays, reads mask arguments, and caches
  // input/output pointers. Called once from OnInit().
  MaceStatus InitPrams() {
    input_ = GetInputData<T>(INPUT);
    input_dims_ = GetInputShapeDims(INPUT);
    input_dim_size_ = GetInputShapeDimSize(INPUT);
    MACE_ASSERT1(input_dim_size_ > 0 && input_dim_size_ <= 4,
                 "The input dims should be an integer in (0, 4].");
    ScratchBuffer scratch_buffer(engine_config_);
    begin_ = scratch_buffer.GetBuffer<int32_t>(input_dim_size_);
    end_ = scratch_buffer.GetBuffer<int32_t>(input_dim_size_);
    strides_ = scratch_buffer.GetBuffer<int32_t>(input_dim_size_);
    output_shape_ = scratch_buffer.GetBuffer<int32_t>(input_dim_size_);
    dim_stride_ = scratch_buffer.GetBuffer<int32_t>(input_dim_size_);
    base::memset(begin_, static_cast<int32_t>(0), input_dim_size_);
    base::memset(end_, static_cast<int32_t>(0), input_dim_size_);
    base::memset(strides_, static_cast<int32_t>(1), input_dim_size_);
    begin_dims_ = GetInputShapeDims(BEGIN);
    end_dims_ = GetInputShapeDims(END);
    MACE_ASSERT1(
        GetInputShapeDimSize(BEGIN) == 1 && GetInputShapeDimSize(END) == 1,
        "Expected begin, end, and to be 1D tensor");
    output_ = GetOutputData<T>(OUTPUT);
    begin_mask_ = GetArgByName("begin_mask", static_cast<int32_t>(0));
    end_mask_ = GetArgByName("end_mask", static_cast<int32_t>(0));
    ellipsis_mask_ = GetArgByName("ellipsis_mask", static_cast<int32_t>(0));
    new_axis_mask_ = GetArgByName("new_axis_mask", static_cast<int32_t>(0));
    shrink_axis_mask_ =
        GetArgByName("shrink_axis_mask", static_cast<int32_t>(0));
    is_slice_ = GetArgByName("slice", false);
    MACE_ASSERT1(ellipsis_mask_ == 0 && new_axis_mask_ == 0,
                 "ellipsis_mask and new_axis_mask are not supported yet.");
    return MACE_SUCCESS;
  }
  // Converts a possibly-negative index to a forward index and clamps it
  // to valid_range (inclusive bounds chosen by the caller).
  int32_t FormatIndices(const int32_t (&valid_range)[2],
                        const int32_t dim_len, int32_t indice) {
    int32_t forward = indice < 0 ? indice + dim_len : indice;
    return base::clamp(forward, valid_range[0], valid_range[1]);
  }
  // Normalizes begin_/end_/strides_ from the input tensors plus masks,
  // and fills output_shape_. Called from Run() since the BEGIN/END/
  // STRIDES tensors carry runtime data.
  MaceStatus AdjustPrams() {
    const int32_t *begin = GetInputData<int32_t>(BEGIN);
    base::memcpy(begin_, begin, begin_dims_[0] * sizeof(int32_t));
    const int32_t *end = GetInputData<int32_t>(END);
    base::memcpy(end_, end, end_dims_[0] * sizeof(int32_t));
    // STRIDES is an optional fourth input; default stride is 1.
    const int32_t *strides = NULL;
    if (GetInputSize() > 3) {
      strides = GetInputData<int32_t>(STRIDES);
      strides_dims_ = GetInputShapeDims(STRIDES);
    }
    if (strides == NULL) {
      base::memset(strides_, static_cast<int32_t>(1), input_dim_size_);
      strides_dims_ = begin_dims_;
    } else {
      base::memcpy(strides_, strides, strides_dims_[0] * sizeof(int32_t));
    }
    output_shape_idx_ = 0;
    const uint32_t begin_size = static_cast<uint32_t>(begin_dims_[0]);
    MACE_UNUSED(begin_size);  // only read inside assertions
    const uint32_t end_size = static_cast<uint32_t>(end_dims_[0]);
    if (is_slice_) {
      // Slice semantics: end_ currently holds per-dim sizes; -1 means
      // "to the end of that dimension".
      MACE_ASSERT1(begin_size == input_dim_size_ && end_size == input_dim_size_,
                   "In slice, begin and size elements num should be equal")
      for (uint32_t i = 0; i < input_dim_size_; ++i) {
        if (end_[i] == -1) {
          end_[i] = input_dims_[i] - begin_[i];
        }
      }
      for (uint32_t i = 0; i < input_dim_size_; ++i) {
        int32_t b = begin_[i];
        int32_t s = end_[i];
#ifndef NDEBUG
        int32_t input_i = input_dims_[i];
        if (!(0 <= b && b <= input_i)) {
          LOG(FATAL) << "In Slice, expected begin[" << i << "] in [0, "
                     << input_i << "], but got " << b;
        }
        if (!(0 <= s && b + s <= input_i)) {
          LOG(FATAL) << "In Slice, expected size[" << i << "] in [0, "
                     << input_i - b << "], but got" << s;
        }
#endif
        // Convert (begin, size) into (begin, end) for the slicing loops.
        end_[i] = b + s;
        output_shape_[output_shape_idx_++] = s;
      }
    } else {
      const uint32_t strides_size = static_cast<uint32_t>(strides_dims_[0]);
      MACE_ASSERT2(begin_size == end_size && end_size == strides_size,
                   "In strided_slice, expected begin, end, and strides to be",
                   " equal size tensors");
      for (uint32_t i = 0; i < strides_size; ++i) {
        MACE_ASSERT1(strides_[i] != 0, "strides data cannot be 0!");
      }
      // pad: dims not covered by END default to the full dimension.
      for (uint32_t i = end_size; i < input_dim_size_; ++i) {
        end_[i] = input_dims_[i];
      }
      // mask and shrink
      for (uint32_t d = 0; d < input_dim_size_; ++d) {
        int32_t dim_len = input_dims_[d];
        // Valid clamp range depends on walk direction: negative strides
        // may stop at -1 (one before index 0).
        const int32_t valid_range[] = {strides_[d] > 0 ? 0 : -1,
                                       strides_[d] > 0 ? dim_len : dim_len - 1};
        if (!(shrink_axis_mask_ & (1 << d))) {
          if (begin_mask_ & (1 << d)) {
            // begin_mask bit set: start from the walk's natural origin.
            begin_[d] = strides_[d] > 0 ? 0 : dim_len - 1;
          } else {
            begin_[d] = FormatIndices(valid_range, dim_len, begin_[d]);
          }
          if (end_mask_ & (1 << d)) {
            // end_mask bit set: run to the walk's natural end.
            end_[d] = strides_[d] > 0 ? dim_len : -1;
          } else {
            end_[d] = FormatIndices(valid_range, dim_len, end_[d]);
          }
          // ceil((end - begin) / stride), floored at 0 for empty slices.
          int32_t out_dim_len = base::max(
              static_cast<int32_t>(0), base::ceil((end_[d] - begin_[d]) /
                  static_cast<float>(strides_[d])));
          output_shape_[output_shape_idx_++] = out_dim_len;
        } else {
          // shrink_axis: take a single index and drop this dimension
          // from the output shape.
          begin_[d] = begin_[d] < 0 ? begin_[d] + dim_len : begin_[d];
          end_[d] = begin_[d] + 1;
#ifndef NDEBUG
          if (!(begin_[d] >= 0 && begin_[d] < dim_len)) {
            LOG(FATAL) << "slice begin indice of dimension '" << d << "': "
                       << begin_[d] << ", is out of bound";
          }
#endif
        }
      }
    }
#ifndef NDEBUG
    for (uint32_t i = 0; i < output_shape_idx_; ++i) {
      if (output_shape_[i] <= 0) {
        LOG(FATAL) << "Expected output_shape[" << i
                   << "] larger than 0, but got " << output_shape_[i];
      }
    }
#endif
    return MACE_SUCCESS;
  }
 private:
  const T *input_;              // borrowed input buffer
  const int32_t *input_dims_;   // borrowed input dims
  uint32_t input_dim_size_;     // input rank, in (0, 4]
  int32_t *begin_;              // per-dim start indices (scratch)
  const int32_t *begin_dims_;   // dims of the BEGIN tensor
  int32_t *end_;                // per-dim end indices (scratch)
  const int32_t *end_dims_;     // dims of the END tensor
  int32_t *strides_;            // per-dim strides (scratch)
  const int32_t *strides_dims_; // dims of the STRIDES tensor
  T *output_;                   // output buffer (advanced while writing)
  int32_t *output_shape_;       // computed output dims (scratch)
  uint32_t output_shape_idx_;   // number of valid entries in output_shape_
  int32_t *dim_stride_;         // element strides for the fast path
  int32_t begin_mask_;
  int32_t end_mask_;
  int32_t ellipsis_mask_;       // must be 0 (unsupported)
  int32_t new_axis_mask_;       // must be 0 (unsupported)
  int32_t shrink_axis_mask_;
  bool is_slice_;               // true -> Slice semantics (begin, size)
  MACE_OP_INPUT_TAGS(INPUT, BEGIN, END, STRIDES);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_STRIDED_SLICE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/utils/activation.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/operator.h"
#include "micro/model/argument.h"
namespace micro {
namespace ops {
Activation::Activation() : type_(TYPE_COUNT) {}
// Initializes from an operator's arguments: the "activation" type name
// (bytes of a string; defaults to "NOOP" when absent), "max_limit" and
// "leakyrelu_coefficient".
MaceStatus Activation::Init(const framework::Operator *op) {
  const char *activation_type = reinterpret_cast<const char *>(
      op->GetRepeatArgByName<uint8_t>("activation"));
  if (activation_type == NULL) {
    activation_type = "NOOP";
  }
  const float max_limit = op->GetArgByName("max_limit", 0.0f);
  const float leakyrelu_coefficient =
      op->GetArgByName("leakyrelu_coefficient", 0.0f);
  return Init(activation_type, max_limit, leakyrelu_coefficient);
}
// Initializes from explicit values. `limit` is the RELUX upper bound;
// `leakyrelu_coefficient` scales negative inputs for LEAKYRELU.
MaceStatus Activation::Init(const char *type, const float limit,
                            const float leakyrelu_coefficient) {
  type_ = StringToActivationType(type);
  limit_ = limit;
  leakyrelu_coefficient_ = leakyrelu_coefficient;
  return MACE_SUCCESS;
}
// Returns the configured type; asserts that Init() was called first.
ActivationType Activation::GetActivationType() {
  MACE_ASSERT1(type_ != TYPE_COUNT, "Activation should init first.");
  return type_;
}
// Applies the configured activation element-wise to `size` values from
// input_ptr, writing to output_ptr (NOOP copies nothing and leaves the
// output untouched). NOTE(review): PRELU is accepted by
// StringToActivationType but has no case here, so it falls through to
// MACE_NOT_IMPLEMENTED — confirm whether PRELU is handled elsewhere.
MaceStatus Activation::Compute(const mifloat *input_ptr,
                               const int32_t size, mifloat *output_ptr) {
  MACE_ASSERT1(type_ != TYPE_COUNT, "Activation should init first.");
  switch (type_) {
    case RELU: {
      // max(0, x)
      for (int32_t i = 0; i < size; ++i) {
        *output_ptr++ = base::max<float>(0.f, *input_ptr++);
      }
      break;
    }
    case RELUX: {
      // min(limit_, max(0, x))
      for (int32_t i = 0; i < size; ++i) {
        *output_ptr++ = base::max(0.f, base::min<float>(limit_, *input_ptr++));
      }
      break;
    }
    case LEAKYRELU: {
      // x for x > 0, coefficient * x otherwise
      for (int32_t i = 0; i < size; ++i) {
        float input = *input_ptr;
        *output_ptr = base::max(input, 0.f) +
            base::min(input, 0.f) * leakyrelu_coefficient_;  // NOLINT
        ++input_ptr;
        ++output_ptr;
      }
      break;
    }
    case TANH: {
      for (int32_t i = 0; i < size; ++i) {
        *output_ptr++ = base::tanh(*input_ptr++);
      }
      break;
    }
    case SIGMOID: {
      // 1 / (1 + e^-x)
      for (int32_t i = 0; i < size; ++i) {
        *output_ptr++ = 1 / (1 + base::exp(-(*input_ptr++)));
      }
      break;
    }
    case NOOP: {
      break;
    }
    default: {
      MACE_NOT_IMPLEMENTED;
    }
  }
  return MACE_SUCCESS;
}
// Maps an activation name string (as stored in the model) to its enum
// value; an unknown name is a fatal error.
ActivationType Activation::StringToActivationType(const char *type) {
  static const struct {
    const char *name;
    ActivationType type;
  } kTypeTable[] = {
      {"RELU", RELU},
      {"RELUX", RELUX},
      {"PRELU", PRELU},
      {"TANH", TANH},
      {"SIGMOID", SIGMOID},
      {"NOOP", NOOP},
      {"LEAKYRELU", LEAKYRELU},
  };
  const int32_t table_size =
      static_cast<int32_t>(sizeof(kTypeTable) / sizeof(kTypeTable[0]));
  for (int32_t i = 0; i < table_size; ++i) {
    if (base::strcmp(type, kTypeTable[i].name) == 0) {
      return kTypeTable[i].type;
    }
  }
  LOG(FATAL) << "Unknown activation type: " << type;
  return NOOP;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_UTILS_ACTIVATION_H_
#define MICRO_OPS_UTILS_ACTIVATION_H_
#include "micro/base/types.h"
#include "micro/include/public/micro.h"
namespace micro {
namespace framework {
class Operator;
} // namespace framework
namespace ops {
// Activation kinds understood by Activation::Init(); TYPE_COUNT doubles
// as the "not initialized" sentinel (see Activation's constructor).
enum ActivationType {
  NOOP = 0,
  RELU = 1,
  RELUX = 2,   // RELU clipped at an upper limit
  PRELU = 3,
  TANH = 4,
  SIGMOID = 5,
  LEAKYRELU = 6,
  TYPE_COUNT,  // sentinel, not a real activation
};
// Element-wise activation helper shared by ops. Configure via one of
// the Init() overloads, then call Compute() on data buffers.
class Activation {
 public:
  Activation();
  ~Activation() {}
  // Reads "activation", "max_limit" and "leakyrelu_coefficient"
  // arguments from the operator.
  MaceStatus Init(const framework::Operator *op);
  // Configures directly: `limit` bounds RELUX, `leakyrelu_coefficient`
  // scales negative LEAKYRELU inputs.
  MaceStatus Init(const char *type, const float limit,
                  const float leakyrelu_coefficient);
  // Applies the activation to `size` elements; asserts Init() was called.
  MaceStatus Compute(const mifloat *input_ptr,
                     const int32_t size, mifloat *output_ptr);
  ActivationType GetActivationType();
 private:
  // Converts a model's activation name to the enum; unknown names are fatal.
  ActivationType StringToActivationType(const char *type);
 private:
  ActivationType type_;          // TYPE_COUNT until Init() succeeds
  float limit_;                  // RELUX upper bound
  float leakyrelu_coefficient_;  // LEAKYRELU negative-side scale
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_UTILS_ACTIVATION_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/utils/crumb_utils.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
namespace micro {
namespace ops {
namespace crumb {
// Adds a per-channel bias to `input`, writing the result to `output`.
// The channel dimension is the innermost one: input_dims[last] is
// expected to equal `channel`.
MaceStatus ComputeBias(const mifloat *input, const int32_t *input_dims,
                       const uint32_t input_dim_size, const mifloat *bias,
                       const int32_t channel, mifloat *output) {
  MACE_ASSERT(input != NULL && input_dims != NULL && input_dim_size > 0
                  && bias != NULL && channel > 0 && output != NULL);
  // Product of every dimension except the innermost (channel) one.
  const int32_t outer_size =
      base::accumulate_multi(input_dims, 0, input_dim_size - 1);
  const int32_t total = outer_size * channel;
  for (int32_t base_idx = 0; base_idx < total; base_idx += channel) {
    for (int32_t c = 0; c < channel; ++c) {
      output[base_idx + c] = input[base_idx + c] + bias[c];
    }
  }
  return MACE_SUCCESS;
}
} // namespace crumb
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_UTILS_CRUMB_UTILS_H_
#define MICRO_OPS_UTILS_CRUMB_UTILS_H_
#include "micro/base/types.h"
#include "micro/include/public/micro.h"
namespace micro {
namespace ops {
namespace crumb {
// Adds a per-channel bias (length `channel`) to `input`, whose innermost
// dimension is the channel dimension; the result is written to `output`.
MaceStatus ComputeBias(const mifloat *input, const int32_t *input_dims,
                       const uint32_t input_dim_size,
                       const mifloat *bias, const int32_t channel,
                       mifloat *output);
}  // namespace crumb
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_UTILS_CRUMB_UTILS_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/utils/gemm.h"
#include "micro/base/logging.h"
namespace micro {
namespace ops {
#ifndef MICRO_NOT_OPT
// Batched matrix multiply: output[b] = lhs[b] x rhs[b], accumulating in
// float. This optimized variant processes the output in 4x4 tiles,
// reusing each loaded lhs/rhs value across four products; ragged edges
// (rows % 4, cols % 4) fall back to scalar loops.
// When lhs_batched / rhs_batched is false, the same (single) matrix is
// reused for every batch entry.
MaceStatus Gemm<mifloat>::Compute(const mifloat *lhs_data,
                                  const mifloat *rhs_data,
                                  const int32_t batch,
                                  const int32_t rows,
                                  const int32_t cols,
                                  const int32_t depth,
                                  const MatrixMajor lhs_major,
                                  const MatrixMajor rhs_major,
                                  const MatrixMajor output_major,
                                  const bool lhs_batched,
                                  const bool rhs_batched,
                                  mifloat *output_data) {
  for (int32_t b = 0; b < batch; ++b) {
    MatrixMap<const mifloat>
        lhs_matrix
        (lhs_data + static_cast<int32_t>(lhs_batched) * b * rows * depth,
         lhs_major,
         rows,
         depth);
    MatrixMap<const mifloat>
        rhs_matrix
        (rhs_data + static_cast<int32_t>(rhs_batched) * b * depth * cols,
         rhs_major,
         depth,
         cols);
    MatrixMap<mifloat>
        output_matrix(output_data + b * rows * cols, output_major, rows, cols);
    // Largest multiples of 4 not exceeding rows/cols: the tiled region.
    const int32_t rows_4 = rows / 4 * 4;
    const int32_t cols_4 = cols / 4 * 4;
    for (int32_t r = 0; r < rows; r += 4) {
      if (r < rows_4) {
        int32_t ro[4] = {r, r + 1, r + 2, r + 3};
        for (int32_t c = 0; c < cols; c += 4) {
          if (c < cols_4) {
            // Full 4x4 tile: sum[] is the tile's accumulator in
            // row-major order (sum[ro_i * 4 + co_i]).
            float sum[16] = {0};
            int32_t co[4] = {c, c + 1, c + 2, c + 3};
            for (int32_t d = 0; d < depth; ++d) {
              float lhs0 = lhs_matrix(ro[0], d);
              float lhs1 = lhs_matrix(ro[1], d);
              float lhs2 = lhs_matrix(ro[2], d);
              float lhs3 = lhs_matrix(ro[3], d);
              float rhs0 = rhs_matrix(d, co[0]);
              float rhs1 = rhs_matrix(d, co[1]);
              float rhs2 = rhs_matrix(d, co[2]);
              float rhs3 = rhs_matrix(d, co[3]);
              sum[0] += lhs0 * rhs0;
              sum[1] += lhs0 * rhs1;
              sum[2] += lhs0 * rhs2;
              sum[3] += lhs0 * rhs3;
              sum[4] += lhs1 * rhs0;
              sum[5] += lhs1 * rhs1;
              sum[6] += lhs1 * rhs2;
              sum[7] += lhs1 * rhs3;
              sum[8] += lhs2 * rhs0;
              sum[9] += lhs2 * rhs1;
              sum[10] += lhs2 * rhs2;
              sum[11] += lhs2 * rhs3;
              sum[12] += lhs3 * rhs0;
              sum[13] += lhs3 * rhs1;
              sum[14] += lhs3 * rhs2;
              sum[15] += lhs3 * rhs3;
            }  // d
            for (int32_t ro_i = 0; ro_i < 4; ++ro_i) {
              int32_t ro_i_base = ro_i * 4;
              for (int32_t co_i = 0; co_i < 4; ++co_i) {
                *output_matrix.data(ro[ro_i], co[co_i]) = sum[ro_i_base + co_i];
              }
            }
          } else {
            // Column remainder: scalar products for cols [cols_4, cols)
            // of the current four rows (runs once per row group).
            for (int32_t ro = r; ro < r + 4; ++ro) {
              for (int32_t co = cols_4; co < cols; ++co) {
                float sum = 0;
                for (int32_t d = 0; d < depth; ++d) {
                  sum += lhs_matrix(ro, d) * rhs_matrix(d, co);
                }  // d
                *output_matrix.data(ro, co) = sum;
              }
            }
          }
        }  // c
      } else {
        // Row remainder: scalar products for rows [rows_4, rows),
        // covering every column.
        for (int32_t ro = rows_4; ro < rows; ++ro) {
          for (int32_t c = 0; c < cols; ++c) {
            float sum = 0;
            for (int32_t d = 0; d < depth; ++d) {
              sum += lhs_matrix(ro, d) * rhs_matrix(d, c);
            }  // d
            *output_matrix.data(ro, c) = sum;
          }  // c
        }
      }
    }  // r
  }  // b
  return MACE_SUCCESS;
}
#else
// Reference (unoptimized) batched matrix multiply, compiled when
// MICRO_NOT_OPT is defined: plain triple loop, accumulating in float.
// Semantically equivalent to the tiled variant above.
MaceStatus Gemm<mifloat>::Compute(const mifloat *lhs_data,
                                  const mifloat *rhs_data,
                                  const int32_t batch,
                                  const int32_t rows,
                                  const int32_t cols,
                                  const int32_t depth,
                                  const MatrixMajor lhs_major,
                                  const MatrixMajor rhs_major,
                                  const MatrixMajor output_major,
                                  const bool lhs_batched,
                                  const bool rhs_batched,
                                  mifloat *output_data) {
  for (int32_t b = 0; b < batch; ++b) {
    // When lhs_batched/rhs_batched is false the batch offset is zeroed,
    // so the same matrix is reused for every batch entry.
    MatrixMap<const mifloat>
        lhs_matrix
        (lhs_data + static_cast<int32_t>(lhs_batched) * b * rows * depth,
         lhs_major,
         rows,
         depth);
    MatrixMap<const mifloat>
        rhs_matrix
        (rhs_data + static_cast<int32_t>(rhs_batched) * b * depth * cols,
         rhs_major,
         depth,
         cols);
    MatrixMap<mifloat>
        output_matrix(output_data + b * rows * cols, output_major, rows, cols);
    for (int32_t r = 0; r < rows; ++r) {
      for (int32_t c = 0; c < cols; ++c) {
        float sum = 0;
        for (int32_t d = 0; d < depth; ++d) {
          sum += lhs_matrix(r, d) * rhs_matrix(d, c);
        }  // d
        *output_matrix.data(r, c) = sum;
      }  // c
    }  // r
  }  // b
  return MACE_SUCCESS;
}
#endif
// Convenience overload: derives rows/cols/depth from the (possibly
// transposed) lhs/rhs dimensions and forwards to the major-order based
// Compute() above. Inputs are stored row-major before any transpose.
MaceStatus Gemm<mifloat>::Compute(const mifloat *lhs,
                                  const mifloat *rhs,
                                  const int32_t batch,
                                  const int32_t lhs_rows,
                                  const int32_t lhs_cols,
                                  const int32_t rhs_rows,
                                  const int32_t rhs_cols,
                                  const bool transpose_lhs,
                                  const bool transpose_rhs,
                                  const bool transpose_out,
                                  const bool lhs_batched,
                                  const bool rhs_batched,
                                  mifloat *output_data) {
  const int32_t rows = transpose_lhs ? lhs_cols : lhs_rows;
  const int32_t depth = transpose_lhs ? lhs_rows : lhs_cols;
  const int32_t cols = transpose_rhs ? rhs_rows : rhs_cols;
  MACE_ASSERT1(depth == (transpose_rhs ? rhs_cols : rhs_rows),
               "Matrices that multiply have inconsistent depth dim: ");
  // A transposed operand's row-major storage is read as column-major.
  const MatrixMajor lhs_major = transpose_lhs ? ColMajor : RowMajor;
  const MatrixMajor rhs_major = transpose_rhs ? ColMajor : RowMajor;
  const MatrixMajor out_major = transpose_out ? ColMajor : RowMajor;
  return Compute(lhs, rhs, batch, rows, cols, depth,
                 lhs_major, rhs_major, out_major,
                 lhs_batched, rhs_batched, output_data);
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_UTILS_GEMM_H_
#define MICRO_OPS_UTILS_GEMM_H_
#include "micro/base/types.h"
#include "micro/include/public/micro.h"
#include "micro/ops/utils/matrix.h"
namespace micro {
namespace ops {
// Generic GEMM: output (rows x cols) = lhs (rows x depth) * rhs (depth x cols)
// for `batch` stacked matrices. Declaration only — the mifloat specialization
// below is the one implemented (see the .cc file).
//
// Fix: the primary template previously declared its inputs as `const mifloat *`
// while the output was `T *`; the generic declaration should be uniform in T,
// matching the parallel Gemv<T> template. The mifloat specialization used by
// callers is unaffected.
template<typename T>
class Gemm {
 public:
  Gemm() {}
  ~Gemm() {}
  // lhs_batched / rhs_batched: whether the operand advances between batches
  // or is shared across all of them.
  MaceStatus Compute(const T *lhs_data,
                     const T *rhs_data,
                     const int32_t batch,
                     const int32_t rows,
                     const int32_t cols,
                     const int32_t depth,
                     const MatrixMajor lhs_major,
                     const MatrixMajor rhs_major,
                     const MatrixMajor output_major,
                     const bool lhs_batched,
                     const bool rhs_batched,
                     T *output_data);
};
// Float (mifloat) GEMM specialization; implementations live in the .cc file.
template<>
class Gemm<mifloat> {
 public:
  Gemm() {}
  ~Gemm() {}
  // Major-based entry: output (rows x cols) = lhs (rows x depth) *
  // rhs (depth x cols), per batch. *_batched: whether the operand advances
  // between batches or is shared.
  MaceStatus Compute(const mifloat *lhs_data,
                     const mifloat *rhs_data,
                     const int32_t batch,
                     const int32_t rows,
                     const int32_t cols,
                     const int32_t depth,
                     const MatrixMajor lhs_major,
                     const MatrixMajor rhs_major,
                     const MatrixMajor output_major,
                     const bool lhs_batched,
                     const bool rhs_batched,
                     mifloat *output_data);
  // Original matrix before transpose has row-major
  // Transpose-flag entry: converts the flags to matrix majors, checks the
  // inner dimensions agree, and forwards to the overload above.
  MaceStatus Compute(
      const mifloat *lhs_data,
      const mifloat *rhs_data,
      const int32_t batch,
      const int32_t lhs_rows,
      const int32_t lhs_cols,
      const int32_t rhs_rows,
      const int32_t rhs_cols,
      const bool transpose_lhs,
      const bool transpose_rhs,
      const bool transpose_out,
      const bool lhs_batched,
      const bool rhs_batched,
      mifloat *output_data);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_UTILS_GEMM_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/utils/gemv.h"
#include "micro/base/logging.h"
namespace micro {
namespace ops {
// Batched matrix-vector product: for each batch b,
//   output[b] = lhs[b] (lhs_height x lhs_width, row-major) * rhs[b] + bias.
// bias_data may be NULL (treated as zeros). When lhs_batched / rhs_batched is
// false, the same lhs / rhs buffer is reused for every batch.
// Small heights (1/2/3) get dedicated paths; height >= 4 is processed in
// 4-row strips so each rhs element is loaded once per strip.
//
// NOTE(review): the output index is derived from the lhs batch base, so when
// lhs_batched is false and batch > 1 every batch writes the same output rows
// — presumably callers never use that combination; verify.
MaceStatus Gemv<mifloat>::Compute(const mifloat *lhs_data,
                                  const mifloat *rhs_data,
                                  const mifloat *bias_data,
                                  const int32_t batch,
                                  const int32_t lhs_height,
                                  const int32_t lhs_width,
                                  const bool lhs_batched,
                                  const bool rhs_batched,
                                  mifloat *output_data) {
  if (lhs_height == 1) {
    for (int32_t b = 0; b < batch; ++b) {
      const int32_t lhs_b_base = static_cast<int32_t>(lhs_batched) * b;
      const int32_t rhs_b_base =
          static_cast<int32_t>(rhs_batched) * b * lhs_width;
      float sum = bias_data != NULL ? bias_data[0] : 0.0f;
      const int32_t lhs_h_base = lhs_b_base * lhs_width;
      for (int32_t w = 0; w < lhs_width; ++w) {
        sum += lhs_data[lhs_h_base + w] * rhs_data[rhs_b_base + w];
      }  // w
      output_data[lhs_b_base] = sum;
    }  // b
  } else if (lhs_height == 2) {
    for (int32_t b = 0; b < batch; ++b) {
      const int32_t lhs_b_base =
          static_cast<int32_t>(lhs_batched) * b * 2;
      const int32_t rhs_b_base =
          static_cast<int32_t>(rhs_batched) * b * lhs_width;
      float sum0 = bias_data != NULL ? bias_data[0] : 0.0f;
      float sum1 = bias_data != NULL ? bias_data[1] : 0.0f;
      const int32_t lhs_h_base0 = lhs_b_base * lhs_width;
      const int32_t lhs_h_base1 = lhs_h_base0 + lhs_width;
      for (int32_t w = 0; w < lhs_width; ++w) {
        float rhs_data_value = rhs_data[rhs_b_base + w];
        sum0 += lhs_data[lhs_h_base0 + w] * rhs_data_value;
        sum1 += lhs_data[lhs_h_base1 + w] * rhs_data_value;
      }  // w
      output_data[lhs_b_base] = sum0;
      output_data[lhs_b_base + 1] = sum1;
    }  // b
  } else if (lhs_height == 3) {
    for (int32_t b = 0; b < batch; ++b) {
      // Bug fix: a 3-row lhs advances 3 rows per batch, not 2. The old
      // "* 2" was copy-pasted from the lhs_height == 2 branch, making every
      // batch after the first read lhs and write output misaligned.
      const int32_t lhs_b_base =
          static_cast<int32_t>(lhs_batched) * b * 3;
      const int32_t rhs_b_base =
          static_cast<int32_t>(rhs_batched) * b * lhs_width;
      float sum0 = bias_data != NULL ? bias_data[0] : 0.0f;
      float sum1 = bias_data != NULL ? bias_data[1] : 0.0f;
      float sum2 = bias_data != NULL ? bias_data[2] : 0.0f;
      const int32_t lhs_h_base0 = lhs_b_base * lhs_width;
      const int32_t lhs_h_base1 = lhs_h_base0 + lhs_width;
      const int32_t lhs_h_base2 = lhs_h_base1 + lhs_width;
      for (int32_t w = 0; w < lhs_width; ++w) {
        float rhs_data_value = rhs_data[rhs_b_base + w];
        sum0 += lhs_data[lhs_h_base0 + w] * rhs_data_value;
        sum1 += lhs_data[lhs_h_base1 + w] * rhs_data_value;
        sum2 += lhs_data[lhs_h_base2 + w] * rhs_data_value;
      }  // w
      output_data[lhs_b_base] = sum0;
      output_data[lhs_b_base + 1] = sum1;
      output_data[lhs_b_base + 2] = sum2;
    }  // b
  } else {  // lhs_height >= 4
    int32_t lhs_height_end = lhs_height - 4;
    for (int32_t b = 0; b < batch; ++b) {
      const int32_t lhs_b_base =
          static_cast<int32_t>(lhs_batched) * b * lhs_height;
      const int32_t rhs_b_base =
          static_cast<int32_t>(rhs_batched) * b * lhs_width;
      for (int32_t h = 0; h < lhs_height; h += 4) {
        // Clamp the last strip so it ends exactly at the final row; rows in
        // the overlap are recomputed, which is safe (pure overwrite).
        if (h > lhs_height_end) {
          h = lhs_height_end;
        }
        float sum0 = 0;
        float sum1 = 0;
        float sum2 = 0;
        float sum3 = 0;
        if (bias_data != NULL) {
          sum0 = bias_data[0];
          sum1 = bias_data[1];
          sum2 = bias_data[2];
          sum3 = bias_data[3];
        }
        const int32_t lhs_h_base0 = (lhs_b_base + h) * lhs_width;
        const int32_t lhs_h_base1 = lhs_h_base0 + lhs_width;
        const int32_t lhs_h_base2 = lhs_h_base1 + lhs_width;
        const int32_t lhs_h_base3 = lhs_h_base2 + lhs_width;
        for (int32_t w = 0; w < lhs_width; ++w) {
          float rhs_data_value = rhs_data[rhs_b_base + w];
          sum0 += lhs_data[lhs_h_base0 + w] * rhs_data_value;
          sum1 += lhs_data[lhs_h_base1 + w] * rhs_data_value;
          sum2 += lhs_data[lhs_h_base2 + w] * rhs_data_value;
          sum3 += lhs_data[lhs_h_base3 + w] * rhs_data_value;
        }  // w
        output_data[lhs_b_base + h] = sum0;
        output_data[lhs_b_base + h + 1] = sum1;
        output_data[lhs_b_base + h + 2] = sum2;
        output_data[lhs_b_base + h + 3] = sum3;
      }  // h
    }  // b
  }
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_UTILS_GEMV_H_
#define MICRO_OPS_UTILS_GEMV_H_
#include "micro/base/types.h"
#include "micro/include/public/micro.h"
namespace micro {
namespace ops {
// Generic GEMV: output[b] = lhs[b] (lhs_height x lhs_width) * rhs[b] + bias.
// Declaration only; the mifloat specialization below is the implemented one.
template<typename T>
class Gemv {
 public:
  Gemv() {}
  ~Gemv() {}
  // Always row-major after transpose
  // bias_data may be NULL. lhs_batched / rhs_batched: whether the operand
  // advances between batches or is shared across all of them.
  MaceStatus Compute(
      const T *lhs_data,
      const T *rhs_data,
      const T *bias_data,
      const int32_t batch,
      const int32_t lhs_height,
      const int32_t lhs_width,
      const bool lhs_batched,
      const bool rhs_batched,
      T *output_data);
};
// Float (mifloat) GEMV specialization; implementation in gemv.cc.
template<>
class Gemv<mifloat> {
 public:
  Gemv() {}
  ~Gemv() {}
  // Always row-major after transpose
  // bias_data may be NULL (treated as zeros).
  MaceStatus Compute(
      const mifloat *lhs_data,
      const mifloat *rhs_data,
      const mifloat *bias_data,
      const int32_t batch,
      const int32_t lhs_height,
      const int32_t lhs_width,
      const bool lhs_batched,
      const bool rhs_batched,
      mifloat *output_data);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_UTILS_GEMV_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_UTILS_MATRIX_H_
#define MICRO_OPS_UTILS_MATRIX_H_
#include "micro/base/logging.h"
namespace micro {
namespace ops {
// Storage order of a matrix in a flat buffer.
enum MatrixMajor {
  RowMajor,
  ColMajor
};

// Returns the storage order observed after a logical transpose
// (row-major becomes column-major and vice versa).
inline MatrixMajor TransposeMatrixMajor(const MatrixMajor src_major) {
  if (src_major == RowMajor) {
    return ColMajor;
  }
  return RowMajor;
}
// Lightweight non-owning 2-D view over a flat buffer. Supports row-major and
// column-major layouts with an explicit element stride, which lets block()
// return sub-matrix views without copying.
template<typename T>
class MatrixMap {
 public:
  // Null view: no data, all dimensions zero.
  MatrixMap()
    : data_(NULL),
      matrix_major_(RowMajor),
      rows_(0),
      cols_(0),
      stride_(0) {}
  // Dense view: the stride is derived from the major order (rows for
  // ColMajor, cols for RowMajor).
  MatrixMap(T *data,
            const MatrixMajor matrix_major,
            const int32_t rows,
            const int32_t cols) :
    data_(data),
    matrix_major_(matrix_major),
    rows_(rows),
    cols_(cols),
    stride_(matrix_major == ColMajor ? rows : cols) {}
  // Strided view: used for sub-matrix blocks inside a larger matrix.
  MatrixMap(T *data,
            const MatrixMajor matrix_major,
            const int32_t rows,
            const int32_t cols,
            const int32_t stride) :
    data_(data),
    matrix_major_(matrix_major),
    rows_(rows),
    cols_(cols),
    stride_(stride) {}
  // Shallow copy; both views alias the same underlying buffer.
  MatrixMap(const MatrixMap &other)
    : data_(other.data_),
      matrix_major_(other.matrix_major_),
      rows_(other.rows_),
      cols_(other.cols_),
      stride_(other.stride_) {}

  MatrixMajor matrix_major() const { return matrix_major_; }
  int32_t rows() const { return rows_; }
  int32_t cols() const { return cols_; }
  int32_t stride() const { return stride_; }
  // Element distance between vertically adjacent entries (consecutive rows).
  int32_t rows_stride() const {
    return matrix_major_ == ColMajor ? 1 : stride_;
  }
  // Element distance between horizontally adjacent entries (consecutive cols).
  int32_t cols_stride() const {
    return matrix_major_ == RowMajor ? 1 : stride_;
  }
  // Logical element count (ignores any stride padding).
  int32_t size() const { return rows_ * cols_; }
  T *data() const { return data_; }
  // Pointer to element (rows, cols); no bounds checking.
  T *data(int32_t rows, int32_t cols) const {
    return data_ + rows * rows_stride() + cols * cols_stride();
  }
  T &operator()(int32_t row, int32_t col) const { return *data(row, col); }
  // Sub-matrix view sharing this view's buffer, major order, and stride.
  MatrixMap block(int32_t start_row, int32_t start_col, int32_t block_rows,
                  int32_t block_cols) const {
    MACE_ASSERT(start_row >= 0);
    MACE_ASSERT(start_row + block_rows <= rows_);
    MACE_ASSERT(start_col >= 0);
    MACE_ASSERT(start_col + block_cols <= cols_);

    return MatrixMap(data(start_row, start_col),
                     matrix_major_,
                     block_rows,
                     block_cols,
                     stride_);
  }

 private:
  T *data_;                    // not owned
  MatrixMajor matrix_major_;
  int32_t rows_;
  int32_t cols_;
  int32_t stride_;             // elements between consecutive major lines
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_UTILS_MATRIX_H_
# Build rules for the micro port (platform-abstraction) layer.
package(
    default_visibility = ["//visibility:public"],
)

load(
    "//micro:micro.bzl",
    "if_hexagon_enabled",
)

licenses(["notice"])  # Apache 2.0

# Reference port implementation. Hexagon builds pull in the DSP SDK headers
# and define MACE_ENABLE_HEXAGON to select the DSP code paths.
cc_library(
    name = "port",
    srcs = glob(["*.cc"]),
    hdrs = glob(["*.h"]),
    copts = [
        "-Werror",
        "-Wextra",
    ] + if_hexagon_enabled([
        "-DMACE_ENABLE_HEXAGON",
    ]),
    deps = if_hexagon_enabled([
        "@hexagon_sdk//:headers_incs",
        "@hexagon_sdk//:headers_incs_stddef",
        "@hexagon_tools//:headers_tools_target",
    ]),
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/port/api.h"
#include <stdlib.h>
#include <stdio.h>
#ifdef MACE_ENABLE_HEXAGON
#include <HAP_perf.h>
#include <HAP_farf.h>
#else
#include <sys/time.h>
#endif
namespace micro {
namespace port {
namespace api {

// Reference implementations of the port-layer hooks. A platform port is
// expected to replace this file with its own implementations.

// Writes a log string: FARF on Hexagon DSP builds, stdout otherwise.
void DebugLog(const char *str) {
  // you should rewrite this file in the platform source file.
#ifdef MACE_ENABLE_HEXAGON
  FARF(ALWAYS, "%s", str);
#else
  printf("%s", str);
#endif
}

// Current time in microseconds (HAP perf clock on the DSP, gettimeofday
// elsewhere).
int64_t NowMicros() {
  // you should rewrite this file in the platform source file.
#ifdef MACE_ENABLE_HEXAGON
  return HAP_perf_get_time_us();
#else
  struct timeval tv;
  gettimeofday(&tv, 0);
  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#endif
}

// Abnormal termination hook.
void Abort() {
  // you should rewrite this file in the platform source file.
  abort();
}

}  // namespace api
}  // namespace port
}  // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_PORT_API_H_
#define MICRO_PORT_API_H_

#include <stdint.h>

namespace micro {
namespace port {
namespace api {

// Platform-abstraction hooks; each target platform supplies its own
// implementations (micro/port/api.cc holds the reference versions).

// Writes a log string to the platform's debug channel.
void DebugLog(const char *str);
// Current time in microseconds.
int64_t NowMicros();
// Abnormal termination.
void Abort();

}  // namespace api
}  // namespace port
}  // namespace micro

#endif  // MICRO_PORT_API_H_
# Build rules for the Hexagon baseline benchmark: runs a generated model on
# the DSP through a FastRPC (QAIC IDL) interface.
package(
    default_visibility = ["//visibility:public"],
)

licenses(["notice"])  # Apache 2.0

load(
    "//micro:micro.bzl",
    "if_hexagon_enabled",
    "if_not_hexagon_enabled",
)

MACEMC_IDL_FILES = [
    "macemc/rpc/macemc.idl",
]

MACEMC_IDL_HEADERS = [
    "codegen/macemc.h",
]

MACEMC_IDL_SKELS = [
    "codegen/macemc_skel.c",
]

MACEMC_IDL_STUBS = [
    "codegen/macemc_stub.c",
]

# Runs the QAIC compiler over the IDL to generate the header plus the
# DSP-side skeleton and CPU-side stub sources.
genrule(
    name = "macemc_idl_gen",
    srcs = MACEMC_IDL_FILES,
    outs = MACEMC_IDL_HEADERS + MACEMC_IDL_SKELS + MACEMC_IDL_STUBS,
    cmd = "bash $(location //micro/test/ccutils:qaic) $(@D)/codegen $(SRCS)",
    tools = ["//micro/test/ccutils:qaic"],
)

# Generated skeleton (DSP side) compiled as C99.
cc_library(
    name = "macemc_idl_skel",
    srcs = MACEMC_IDL_SKELS,
    hdrs = MACEMC_IDL_HEADERS,
    copts = [
        "-Werror",
        "-std=c99",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        "@hexagon_sdk//:headers_dsp",
    ],
    alwayslink = True,
)

# Shared object loaded onto the DSP; bundles the skeleton, the generated
# model engine, and the RPC skeleton utilities.
cc_binary(
    name = "libmacemc_skel.so",
    srcs = glob(["macemc/rpc/skel/*.cc"]),
    linkshared = True,
    deps = [
        ":macemc_idl_skel",
        "//micro/codegen:micro_engine",
        "//micro/include",
        "//micro/test/ccutils:rpc_skel",
        "@hexagon_sdk//:headers_dsp",
    ],
)

# Generated stub (CPU side) compiled as C99.
cc_library(
    name = "macemc_idl_stub",
    srcs = MACEMC_IDL_STUBS,
    hdrs = MACEMC_IDL_HEADERS,
    copts = [
        "-Werror",
        "-std=c99",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    deps = [
        "@hexagon_sdk//:sdk_arm",
    ],
    alwayslink = True,
)

# C++ wrapper around the generated stub (MaceMc / MaceMcBaselineRun).
cc_library(
    name = "macemc_stub",
    srcs = glob(["macemc/rpc/stub/*.cc"]),
    hdrs = glob(["macemc/rpc/stub/*.h"]),
    strip_include_prefix = "",
    deps = [
        ":macemc_idl_stub",
        "//micro/test/ccutils:rpc_stub",
    ],
    alwayslink = True,
)

# CPU-side test entry that triggers one baseline run on the DSP.
cc_test(
    name = "micro_cc_baseline",
    srcs = glob([
        "test_baseline_main.cc",
    ]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
        "-DMACE_ENABLE_HEXAGON",
    ],
    linkstatic = 1,
    deps = [
        ":macemc_stub",
    ],
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "remote.idl"

// FastRPC interface for the macemc baseline: a single run() method that
// executes the generated model once on the DSP.
interface macemc : remote_handle64 {
  long run();
};
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "rpc/skel/base_func.h"
#include <HAP_farf.h>
// Implemented in another translation unit of the skeleton library; runs the
// generated model once on the DSP.
extern void MaceMcRun();

extern "C" {

// NOTE(review): presumably expands to the FastRPC open/close boilerplate for
// the "macemc" interface — confirm against rpc/skel/base_func.h.
MACE_DEFINE_RANDOM_INPUT(macemc)

// Skeleton entry for macemc::run(): executes the model and logs the remote
// handle for tracing. Always reports success (0) to the RPC layer.
int macemc_run(remote_handle64 h) {
  MaceMcRun();
  FARF(ALWAYS, "run end, h=%d", h);
  return 0;
}

}  // extern "C"
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <HAP_farf.h>
#include "micro/include/public/micro.h"
#include "rpc/skel/base_func.h"
// MICRO_MODEL_NAME is injected by the build command and selects the
// generated per-model namespace.
#ifndef MICRO_MODEL_NAME
#error Please specify model name in the command
#endif

namespace micro {
namespace MICRO_MODEL_NAME {
// Generated accessor for the per-model singleton inference engine.
MaceStatus GetMicroEngineSingleton(MaceMicroEngine **engine);
}  // namespace MICRO_MODEL_NAME

namespace port {
namespace api {
int64_t NowMicros();
}  // namespace api
}  // namespace port

namespace testing {
namespace {
// Number of timed iterations after the cold-start run.
const int32_t kMicroRunTestTimes = 10;
// NOTE(review): input shape is hard-coded (1 x 1 x 128 x 9 floats) — it must
// match the model this target is built against; confirm when changing models.
const int32_t input0_shape[4] = {1, 1, 128, 9};
const int32_t input_length = 1 * 1 * 128 * 9;
float input0[input_length] = {0};
}  // namespace

// Benchmarks the generated model on the DSP: logs engine init latency, one
// cold-start run, then the average latency over kMicroRunTestTimes runs.
void MicroRunModel() {
  int64_t t0 = port::api::NowMicros();
  MaceMicroEngine *micro_engine = NULL;
  MICRO_MODEL_NAME::GetMicroEngineSingleton(&micro_engine);
  int64_t t1 = port::api::NowMicros();
  double init_millis = (t1 - t0) / 1000.0;
  FARF(ALWAYS, "Total init latency: %fms", init_millis);
  if (micro_engine == NULL) {
    FARF(ALWAYS, "GetMicroEngineSingleton failed");
    return;
  }

  rpc::skel::FillRandomValue(input0, input_length * sizeof(float));
  micro_engine->RegisterInputData(0, input0, input0_shape);

  // warm up
  t0 = port::api::NowMicros();
  if (micro_engine->Run() != MACE_SUCCESS) {
    FARF(ALWAYS, "warm up error");
    return;
  } else {
    t1 = port::api::NowMicros();
    double run_millis = (t1 - t0) / 1000.0;
    FARF(ALWAYS, "run latency for cold start: %fms", run_millis);
  }

  // run
  t0 = port::api::NowMicros();
  for (int32_t i = 0; i < kMicroRunTestTimes; ++i) {
    micro_engine->Run();
  }
  t1 = port::api::NowMicros();
  double run_millis = (t1 - t0) / kMicroRunTestTimes / 1000.0;
  FARF(ALWAYS, "run latency: %fms", run_millis);
}

}  // namespace testing
}  // namespace micro

// Entry invoked from the FastRPC skeleton (see macemc_run).
void MaceMcRun() {
  micro::testing::MicroRunModel();
}
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "macemc/rpc/stub/macemc.h"
#include "micro/test/ccbaseline/codegen/macemc.h"
namespace micro {
namespace testing {
namespace {
// FastRPC URI routing the macemc interface to the sensor DSP domain.
const char kMaceMcUri[] = macemc_URI"&_dom=sdsp";
}  // namespace

// CPU-side stub handle for the macemc interface; BaseHandle manages the
// remote session lifetime via the generated open/close functions.
MaceMc::MaceMc() :
    rpc::stub::BaseHandle(macemc_open, macemc_close, kMaceMcUri) {}

// Invokes macemc::run() on the DSP through the generated stub.
void MaceMc::Run() {
  macemc_run(remote_handle_);
}

}  // namespace testing
}  // namespace micro

// Convenience driver: open the remote handle, run the model once, close.
void MaceMcBaselineRun() {
  micro::testing::MaceMc mace_mc;
  mace_mc.Open();
  mace_mc.Run();
  mace_mc.Close();
}
// Copyright 2020 The MICRO Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_TEST_CCBASELINE_MACEMC_RPC_STUB_MACEMC_H_
#define MICRO_TEST_CCBASELINE_MACEMC_RPC_STUB_MACEMC_H_

#include "rpc/stub/base_handle.h"

namespace micro {
namespace testing {

// CPU-side stub for the macemc FastRPC interface; Run() executes the
// baseline model once on the DSP.
class MaceMc : public rpc::stub::BaseHandle {
 public:
  MaceMc();
  void Run();
};

}  // namespace testing
}  // namespace micro

// Opens a MaceMc handle, runs the model once, and closes the handle.
void MaceMcBaselineRun();

#endif  // MICRO_TEST_CCBASELINE_MACEMC_RPC_STUB_MACEMC_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Declared in macemc/rpc/stub/macemc.h; runs the baseline model once.
void MaceMcBaselineRun();

// Test entry point; command-line arguments are ignored.
int main(int argc, char *argv[]) {
  (void) argc;
  (void) argv;
  MaceMcBaselineRun();
  return 0;
}
# Build rules for the micro op benchmarks; runnable on the host/CPU or,
# with --define hexagon=true, on the Hexagon DSP over FastRPC.
package(
    default_visibility = ["//visibility:public"],
)

licenses(["notice"])  # Apache 2.0

load(
    "//micro:micro.bzl",
    "if_hexagon_enabled",
    "if_not_hexagon_enabled",
)

IDL_FILES = [
    "micro/rpc/benchmark.idl",
]

IDL_HEADERS = [
    "codegen/benchmark.h",
]

IDL_SKELS = [
    "codegen/benchmark_skel.c",
]

IDL_STUBS = [
    "codegen/benchmark_stub.c",
]

# Runs the QAIC compiler over the IDL to generate the header, the DSP-side
# skeleton, and the CPU-side stub.
genrule(
    name = "idl_gen",
    srcs = IDL_FILES,
    outs = IDL_HEADERS + IDL_SKELS + IDL_STUBS,
    cmd = "bash $(location //micro/test/ccutils:qaic) $(@D)/codegen $(SRCS)",
    tools = ["//micro/test/ccutils:qaic"],
)

# Generated skeleton (DSP side).
cc_library(
    name = "benchmark_idl_skel",
    srcs = IDL_SKELS,
    hdrs = IDL_HEADERS,
    deps = [
        "@hexagon_sdk//:headers_dsp",
    ],
    alwayslink = True,
)

# Shared object loaded onto the DSP, bundling the benchmark bodies.
cc_binary(
    name = "libbenchmark_skel.so",
    srcs = glob(["micro/rpc/skel/*.c"]),
    deps = [
        ":benchmark_idl_skel",
        ":benchmark_lib",
        ":benchmark_utils",
        "//micro/test/ccutils:rpc_skel",
        "@hexagon_sdk//:headers_dsp",
    ],
    linkshared = True,
    linkstatic = 0,
)

# Generated stub (CPU side).
cc_library(
    name = "benchmark_idl_stub",
    srcs = IDL_STUBS,
    hdrs = IDL_HEADERS,
    deps = [
        "@hexagon_sdk//:sdk_arm",
    ],
    alwayslink = True,
)

# CPU-side wrapper around the generated stub.
cc_library(
    name = "benchmark_stub",
    srcs = glob(["micro/rpc/stub/*.cc"]),
    hdrs = glob(["micro/rpc/stub/*.h"]),
    strip_include_prefix = "",
    deps = [
        ":benchmark_idl_stub",
        "//micro/test/ccutils:rpc_stub",
    ],
    alwayslink = True,
)

# Benchmark harness (registry, timing, report printing).
cc_library(
    name = "benchmark_utils",
    srcs = glob([
        "micro/benchmark_utils/*.cc",
    ]),
    hdrs = glob([
        "micro/benchmark_utils/*.h",
    ]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    strip_include_prefix = "",
    deps = [
        "//micro/base",
        "//micro/test/ccutils",
    ],
    alwayslink = True,
)

# The benchmark bodies themselves (one per op / shape).
cc_library(
    name = "benchmark_lib",
    srcs = glob([
        "micro/ops/*.cc",
        "micro/ops/nhwc/*.cc",
    ]),
    hdrs = glob([
        "micro/benchmark_utils/*.h",
    ]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ] + if_hexagon_enabled([
        "-DMACE_ENABLE_HEXAGON",
    ]),
    strip_include_prefix = "",
    deps = [
        "benchmark_utils",
        "//micro/ops:ops_for_test",
        "//micro/test/ccutils",
    ],
    alwayslink = True,
)

# Test entry: links the stub (DSP execution) or the benchmark bodies
# directly (host execution) depending on the hexagon define.
cc_test(
    name = "micro_cc_benchmark",
    srcs = glob(
        [
            "micro/test_benchmark_main.cc",
        ],
    ),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ] + if_hexagon_enabled([
        "-DMACE_ENABLE_HEXAGON",
    ]),
    linkstatic = 1,
    deps = if_hexagon_enabled([
        ":benchmark_stub",
    ]) + if_not_hexagon_enabled([
        ":benchmark_lib",
    ]),
)
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/common/global_buffer.h"
#include "micro/port/api.h"
namespace micro {
namespace base {
// Forward declarations of the numeric formatters implemented in micro/base.
// [buffer, end) bounds the writable output region.
// NOTE(review): the meaning of the returned pointer is assumed from usage —
// confirm against the micro/base implementation.
template<typename T>
char *ToString(T value, char *buffer, char *end);
template<>
char *ToString(float value, char *buffer, char *end);
template<>
char *ToString(int32_t value, char *buffer, char *end);
template<>
char *ToString(int64_t value, char *buffer, char *end);
}  // namespace base
namespace testing {
namespace {

// Capacity of the static benchmark registry.
const int32_t kMaxBenchmarkNum = 200;
// Fixed report-column widths; each includes one byte for the trailing '\0'.
const int32_t kNameWidth = 50 + 1;
const int32_t kInt64ValueBufferLength = 21;
const int32_t kInt32ValueBufferLength = 12;
const int32_t kFloatValueBufferLength = 21;

// Copies `input` into `output` left-justified, space-padded or truncated to
// exactly fix_width characters. `output` must hold fix_width + 1 bytes.
void GetFixWidthStr(const char *input, char *output, const int32_t fix_width) {
  int32_t length = micro::base::strlen(input);
  if (length >= fix_width) {
    micro::base::memcpy(output, input, fix_width * sizeof(char));
  } else {
    micro::base::memcpy(output, input, length * sizeof(char));
    while (length < fix_width) {
      output[length++] = ' ';
    }
  }
  output[fix_width] = '\0';
}

// Numeric overloads: render the value, then delegate to the string version.
void GetFixWidthStr(int32_t input, char *output, const int32_t fix_width) {
  char int_str[kInt32ValueBufferLength] = {0};
  micro::base::ToString(input, int_str, int_str + kInt32ValueBufferLength);
  GetFixWidthStr(int_str, output, fix_width);
}

void GetFixWidthStr(int64_t input, char *output, const int32_t fix_width) {
  char int_str[kInt64ValueBufferLength] = {0};
  micro::base::ToString(input, int_str, int_str + kInt64ValueBufferLength);
  GetFixWidthStr(int_str, output, fix_width);
}

void GetFixWidthStr(float input, char *output, const int32_t fix_width) {
  char int_str[kFloatValueBufferLength] = {0};
  micro::base::ToString(input, int_str, int_str + kFloatValueBufferLength);
  GetFixWidthStr(int_str, output, fix_width);
}

// Benchmark registry and per-run accounting shared by the driver below.
Benchmark *all_benchmarks[kMaxBenchmarkNum] = {NULL};
int32_t benchmark_size = 0;
// Fix: explicitly zero-initialize for consistency with the sibling counters.
// (Namespace-scope integers are zero-initialized anyway, so this only removes
// the appearance of an accidental omission.)
int64_t bytes_processed = 0;
int64_t macs_processed = 0;
int64_t accum_time = 0;
int64_t start_time = 0;

}  // namespace
// Registers this benchmark in the global registry at static-initialization
// time; instances are created by the MICRO_BENCHMARK macro.
Benchmark::Benchmark(const char *name, BenchmarkFunc *benchmark_func)
    : name_(name), benchmark_func_(benchmark_func) {
  Register();
}
// Runs every registered benchmark and prints a fixed-width report table
// (name, ns/iter, iterations, input MB/s, GMACPS) through LOG(CLEAN).
void Benchmark::Run() {
  LOG(INFO) << "Benchmark::Run start, benchmark_size=" << benchmark_size;
  if (benchmark_size == 0) {
    return;
  }
  // Header row, padded to the same widths as the data rows below.
  char benchmark_name[kNameWidth] = {0};
  GetFixWidthStr("Benchmark", benchmark_name, kNameWidth - 1);
  char time_name[kInt64ValueBufferLength] = {0};
  GetFixWidthStr("Time(ns)", time_name, kInt64ValueBufferLength - 1);
  char iterations_name[kInt32ValueBufferLength] = {0};
  GetFixWidthStr("Iterations", iterations_name, kInt32ValueBufferLength - 1);
  char input_mb_name[kFloatValueBufferLength] = {0};
  GetFixWidthStr("Input(MB/s)", input_mb_name, kFloatValueBufferLength - 1);
  LOG(CLEAN) << benchmark_name << "\t" << time_name << "\t" << iterations_name
             << "\t" << input_mb_name << "\t" << "GMACPS";
  LOG(CLEAN) << "--------------------------------------------------------------"
      "-------------------------------------------------------------";
  for (int32_t i = 0; i < benchmark_size; ++i) {
    Benchmark *b = all_benchmarks[i];
    int32_t iters;
    double seconds;
    // Calibrated run; fills bytes_processed / macs_processed as a side effect
    // of the benchmark body calling BytesProcessed()/MacsProcessed().
    b->Run(&iters, &seconds);
    float mbps = (bytes_processed * 1e-6) / seconds;
    // MACCs or other computations
    float gmacs = (macs_processed * 1e-9) / seconds;
    int64_t ns = static_cast<int64_t>(seconds * 1e9);
    char name_str[kNameWidth] = {0};
    GetFixWidthStr(b->name_, name_str, kNameWidth - 1);
    char ns_str[kInt64ValueBufferLength] = {0};
    GetFixWidthStr(ns / iters, ns_str, kInt64ValueBufferLength - 1);
    char iters_str[kInt32ValueBufferLength] = {0};
    GetFixWidthStr(iters, iters_str, kInt32ValueBufferLength - 1);
    char mbps_str[kFloatValueBufferLength] = {0};
    GetFixWidthStr(mbps, mbps_str, kFloatValueBufferLength - 1);
    char gmacs_str[kInt32ValueBufferLength] = {0};
    if (gmacs != 0) {
      GetFixWidthStr(gmacs, gmacs_str, kInt32ValueBufferLength - 1);
    } else {
      // No MAC count was reported for this benchmark.
      gmacs_str[0] = '-';
    }
    LOG(CLEAN) << name_str << "\t" << ns_str << "\t"
               << iters_str << "\t" << mbps_str << "\t" << gmacs_str;
  }
}
// Appends this benchmark to the fixed-size global registry, asserting that
// the registry is not already full.
void Benchmark::Register() {
  MACE_ASSERT2(benchmark_size < kMaxBenchmarkNum,
               "benchmark_size is:", benchmark_size);
  all_benchmarks[benchmark_size++] = this;
}
// Calibrated single-benchmark run: repeatedly invokes benchmark_func_ with a
// growing iteration count until at least kMinTime seconds are measured (or
// kMaxIters is reached), then returns the final count and elapsed seconds.
void Benchmark::Run(int32_t *run_count, double *run_seconds) {
  static const int32_t kMinIters = 10;
  static const int32_t kMaxIters = 10000;
  static const double kMinTime = 0.5;
  int32_t iters = kMinIters;
  while (true) {
    // Reset per-run accounting; the benchmark body reports these through
    // BytesProcessed()/MacsProcessed().
    bytes_processed = -1;
    macs_processed = 0;
    common::test::GetGlobalBuffer()->reset();
    RestartTiming();
    (*benchmark_func_)(iters);
    StopTiming();
    const double seconds = accum_time * 1e-6;  // accum_time is microseconds
    if (seconds >= kMinTime || iters >= kMaxIters) {
      *run_count = iters;
      *run_seconds = seconds;
      return;
    }
    // Update number of iterations.
    // Overshoot by 100% in an attempt to succeed the next time.
    double multiplier = 2.0 * kMinTime / base::max(seconds, 1e-9);
    iters = base::min<int64_t>(multiplier * iters, kMaxIters); // NOLINT
  }
}
// Record the bytes / multiply-accumulates processed by the current benchmark
// run; used for the MB/s and GMACPS report columns.
void BytesProcessed(int64_t n) { bytes_processed = n; }
void MacsProcessed(int64_t n) { macs_processed = n; }

// Timing helpers built on the port layer's microsecond clock. A zero
// start_time means timing is currently stopped.
void RestartTiming() {
  accum_time = 0;
  start_time = port::api::NowMicros();
}
void StartTiming() {
  start_time = port::api::NowMicros();
}
// Adds the time elapsed since the last Start/RestartTiming to accum_time.
void StopTiming() {
  if (start_time != 0) {
    accum_time += (port::api::NowMicros() - start_time);
    start_time = 0;
  }
}

}  // namespace testing
}  // namespace micro

// C-linkage entry so RPC skeletons and test mains can trigger the full run.
extern "C" {
void BenchmarkRun() {
  micro::testing::Benchmark::Run();
}
}
// Copyright 2019 The MICRO Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Simple benchmarking facility.
#ifndef MICRO_TEST_CCBENCHMARK_MICRO_BENCHMARK_UTILS_TEST_BENCHMARK_H_
#define MICRO_TEST_CCBENCHMARK_MICRO_BENCHMARK_UTILS_TEST_BENCHMARK_H_

// Fix: this header uses int32_t/int64_t but previously included only
// <stdlib.h>, relying on a transitive definition of the fixed-width types.
#include <stdint.h>
#include <stdlib.h>

// Defines a file-scope Benchmark instance so benchmark `n` self-registers at
// static-initialization time.
#define MICRO_BENCHMARK(n) \
static ::micro::testing::Benchmark __benchmark_##n(#n, (n))

namespace micro {
namespace testing {

// A benchmark body: runs its workload `iters` times.
typedef void BenchmarkFunc(int32_t iters);

// One registered benchmark; see test_benchmark.cc for the driver.
class Benchmark {
 public:
  Benchmark(const char *name, BenchmarkFunc *benchmark_func);
  // Runs every registered benchmark and prints the report table.
  static void Run();

 private:
  const char *name_;
  BenchmarkFunc *benchmark_func_;

  void Register();
  // Calibrated run of this benchmark alone.
  void Run(int32_t *run_count, double *run_seconds);
};

// Reporting hooks for benchmark bodies (MB/s and GMACPS columns).
void BytesProcessed(int64_t);
void MacsProcessed(int64_t);
// Timing control for benchmark bodies.
void RestartTiming();
void StartTiming();
void StopTiming();

}  // namespace testing
}  // namespace micro

extern "C" {
void BenchmarkRun();
}

#endif  // MICRO_TEST_CCBENCHMARK_MICRO_BENCHMARK_UTILS_TEST_BENCHMARK_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/activation.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {

// Shared driver: builds an ActivationOp of `activation_type` over the given
// 4-D float input, performs 5 untimed warm-up runs, then times `iters` runs.
// PRELU additionally wires a per-channel alpha input of size input_dims[3].
void ActivationBenchmark(const char *activation_type, int iters,
                         const float *input, const int32_t *input_dims,
                         float *output, int32_t *output_dims) {
  micro::testing::StopTiming();

  const uint32_t arg_type_len = base::strlen(activation_type);
  ActivationOp activation_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddRepeatArg("activation", activation_type, arg_type_len)
      .AddOutput(output, output_dims, 4);
  MACE_DEFINE_RANDOM_INPUT(float, alpha, input_dims[3]);
  if (base::strcmp(activation_type, "PRELU") == 0) {
    substitude_op.AddInput(alpha, input_dims + 3, 1);
  }
  activation_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);

  // Warm-up
  for (int i = 0; i < 5; ++i) {
    activation_op.Run();
  }

  micro::testing::StartTiming();
  while (iters--) {
    activation_op.Run();
  }
}

}  // namespace
// Defines and registers one activation benchmark over an N x H x W x C
// float tensor. Reports bytes processed as iters * N*H*W*C elements.
// Fix: MACE_DEFINE_RANDOM_INPUT(float, input, ...) appeared twice, which
// redefines `input` in the same scope and fails to compile.
#define MICRO_BM_ACTIVATION_MACRO(N, H, W, C, TYPE) \
static void MICRO_BM##_##TYPE##_##N##_##H##_##W##_##C(int32_t iters) { \
const int32_t buffer_length = N * H * W * C; \
MACE_DEFINE_RANDOM_INPUT(float, input, buffer_length); \
float *output = \
common::test::GetGlobalBuffer()->GetBuffer<float>(buffer_length);\
int32_t input_dims[] = {N, H, W, C}; \
int32_t output_dims[4] = {0}; \
const int64_t tot = static_cast<int64_t>(iters) * buffer_length; \
micro::testing::BytesProcessed(tot *(sizeof(float))); \
ActivationBenchmark(#TYPE, iters, input, \
input_dims, output, output_dims); \
} \
MICRO_BENCHMARK(MICRO_BM##_##TYPE##_##N##_##H##_##W##_##C)
// Instantiate each activation type on a tiny (1x4x4x1) and a larger
// (1x128x128x1) input.
#define MICRO_BM_RELU(N, H, W, C) \
MICRO_BM_ACTIVATION_MACRO(N, H, W, C, RELU)
MICRO_BM_RELU(1, 4, 4, 1);
MICRO_BM_RELU(1, 128, 128, 1);
#define MICRO_BM_RELUX(N, H, W, C) \
MICRO_BM_ACTIVATION_MACRO(N, H, W, C, RELUX)
MICRO_BM_RELUX(1, 4, 4, 1);
MICRO_BM_RELUX(1, 128, 128, 1);
// PRELU additionally gets a per-channel alpha input inside
// ActivationBenchmark.
#define MICRO_BM_PRELU(N, H, W, C) \
MICRO_BM_ACTIVATION_MACRO(N, H, W, C, PRELU)
MICRO_BM_PRELU(1, 4, 4, 1);
MICRO_BM_PRELU(1, 128, 128, 1);
#define MICRO_BM_TANH(N, H, W, C) \
MICRO_BM_ACTIVATION_MACRO(N, H, W, C, TANH)
MICRO_BM_TANH(1, 4, 4, 1);
MICRO_BM_TANH(1, 128, 128, 1);
#define MICRO_BM_SIGMOID(N, H, W, C) \
MICRO_BM_ACTIVATION_MACRO(N, H, W, C, SIGMOID)
MICRO_BM_SIGMOID(1, 4, 4, 1);
MICRO_BM_SIGMOID(1, 128, 128, 1);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/bias_add.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
template<typename T>
void BiasAdd(int32_t iters, const int32_t N,
const int32_t H, const int32_t W, const int32_t C) {
micro::testing::StopTiming();
BiasAddOp bias_add_op;
framework::SubstituteOp substitude_op;
const int32_t input_length = N * H * W * C;
MACE_DEFINE_RANDOM_INPUT(T, input, input_length);
MACE_DEFINE_RANDOM_INPUT(T, bias, C);
T *output = common::test::GetGlobalBuffer()->GetBuffer<T>(input_length);
int32_t input_dims[] = {N, H, W, C};
int32_t output_dims[4] = {0};
substitude_op.AddInput(input, input_dims, 4)
.AddInput(bias, input_dims + 3, 1)
.AddOutput(output, output_dims, 4);
bias_add_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
// Warm-up
for (int32_t i = 0; i < 2; ++i) {
bias_add_op.Run();
}
micro::testing::StartTiming();
while (iters--) {
bias_add_op.Run();
}
}
} // namespace
// Defines and registers one BiasAdd benchmark; reports bytes processed as
// iters * N*H*W*C elements of TYPE.
#define MICRO_BM_BIAS_ADD_MACRO(N, H, W, C, TYPE) \
static void MICRO_BM_BIAS_ADD_##N##_##H##_##W##_##C##_##TYPE( \
int32_t iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BiasAdd<TYPE>(iters, N, H, W, C); \
} \
MICRO_BENCHMARK(MICRO_BM_BIAS_ADD_##N##_##H##_##W##_##C##_##TYPE)
#define MICRO_BM_BIAS_ADD(N, H, W, C) \
MICRO_BM_BIAS_ADD_MACRO(N, H, W, C, float)
// Shapes ranging from large-spatial/few-channel to small-spatial/many-channel.
MICRO_BM_BIAS_ADD(1, 128, 128, 1);
MICRO_BM_BIAS_ADD(1, 128, 128, 3);
MICRO_BM_BIAS_ADD(1, 64, 64, 3);
MICRO_BM_BIAS_ADD(1, 56, 56, 16);
MICRO_BM_BIAS_ADD(1, 28, 28, 32);
MICRO_BM_BIAS_ADD(1, 14, 14, 128);
MICRO_BM_BIAS_ADD(1, 14, 14, 256);
MICRO_BM_BIAS_ADD(1, 7, 7, 1024);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/eltwise.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
template<typename T>
void EltwiseBenchmark(int32_t iters, eltwise::Type type, const int32_t N,
const int32_t H, const int32_t W, const int32_t C) {
micro::testing::StopTiming();
EltwiseOp<T> eltwise_op;
framework::SubstituteOp substitude_op;
const int32_t input_length = N * H * W * C;
MACE_DEFINE_RANDOM_INPUT(T, input0, input_length);
MACE_DEFINE_RANDOM_INPUT(T, input1, input_length);
T *output = common::test::GetGlobalBuffer()->GetBuffer<T>(input_length);
int32_t input_dims[] = {N, H, W, C};
int32_t output_dims[4] = {0};
T coeffs[] = {1.2, 2.1};
substitude_op.AddInput(input0, input_dims, 4)
.AddInput(input1, input_dims, 4)
.AddArg("type", static_cast<int32_t>(type))
.AddRepeatArg("coeff", coeffs, sizeof(coeffs) / sizeof(T))
.AddOutput(output, output_dims, 4);
eltwise_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
// Warm-up
for (int32_t i = 0; i < 2; ++i) {
eltwise_op.Run();
}
micro::testing::StartTiming();
while (iters--) {
eltwise_op.Run();
}
}
} // namespace
// Defines and registers one Eltwise benchmark. ELT_TYPE is the integer
// value of an eltwise::Type enumerator (cast back inside the body).
#define MICRO_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE) \
static void \
MICRO_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE( \
int32_t iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<TYPE>( \
iters, static_cast<eltwise::Type>(ELT_TYPE), N, H, W, C); \
} \
MICRO_BENCHMARK( \
MICRO_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE)
#define MICRO_BM_ELTWISE(ELT_TYPE, N, H, W, C) \
MICRO_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float)
// NOTE(review): 0, 2, 5 are raw eltwise::Type enumerator values — see
// micro/ops/eltwise.h for which operations they denote.
MICRO_BM_ELTWISE(2, 1, 32, 32, 8);
MICRO_BM_ELTWISE(2, 1, 60, 60, 16);
MICRO_BM_ELTWISE(2, 1, 64, 64, 8);
MICRO_BM_ELTWISE(0, 1, 32, 32, 8);
MICRO_BM_ELTWISE(0, 1, 60, 60, 16);
MICRO_BM_ELTWISE(5, 1, 32, 32, 8);
MICRO_BM_ELTWISE(5, 1, 60, 60, 16);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/matmul.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
template<typename T>
void MatMulBenchmark(int32_t iters, const int32_t N,
const int32_t H, const int32_t C, const int32_t OW) {
micro::testing::StopTiming();
MatMulOp matmul_op;
framework::SubstituteOp substitude_op;
const int32_t input0_length = N * H * C;
MACE_DEFINE_RANDOM_INPUT(T, input0, input0_length);
const int32_t input1_length = N * C * OW;
MACE_DEFINE_RANDOM_INPUT(T, input1, input1_length);
const int32_t output_length = N * H * OW;
T *output = common::test::GetGlobalBuffer()->GetBuffer<T>(output_length);
int32_t input0_dims[] = {N, H, C};
int32_t input1_dims[] = {N, C, OW};
int32_t output_dims[3] = {0};
substitude_op.AddInput(input0, input0_dims, 3)
.AddInput(input1, input1_dims, 3)
.AddOutput(output, output_dims, 3);
matmul_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
// Warm-up
for (int32_t i = 0; i < 2; ++i) {
matmul_op.Run();
}
micro::testing::StartTiming();
while (iters--) {
matmul_op.Run();
}
}
template<typename T>
void MatMulTransposeBenchmark(int32_t iters, const int32_t N, const int32_t H,
const int32_t C, const int32_t OW) {
micro::testing::StopTiming();
MatMulOp matmul_op;
framework::SubstituteOp substitude_op;
const int32_t input0_length = N * H * C;
MACE_DEFINE_RANDOM_INPUT(T, input0, input0_length);
const int32_t input1_length = N * OW * C;
MACE_DEFINE_RANDOM_INPUT(T, input1, input1_length);
const int32_t output_length = N * H * OW;
T *output = common::test::GetGlobalBuffer()->GetBuffer<T>(output_length);
int32_t input0_dims[] = {N, H, C};
int32_t input1_dims[] = {N, OW, C};
int32_t output_dims[3] = {0};
substitude_op.AddInput(input0, input0_dims, 3)
.AddInput(input1, input1_dims, 3)
.AddArg("transpose_b", 1)
.AddOutput(output, output_dims, 3);
matmul_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
// Warm-up
for (int32_t i = 0; i < 2; ++i) {
matmul_op.Run();
}
micro::testing::StartTiming();
while (iters--) {
matmul_op.Run();
}
}
} // namespace
// Defines and registers one MatMul benchmark for [N,H,C] x [N,C,W].
// Reports N*H*W*C MACs; `tot` counts N*(C*H + H*W) elements per iteration.
#define MICRO_BM_MATMUL_MACRO(N, H, C, W, TYPE) \
static void MICRO_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE( \
int32_t iters) { \
const int64_t macs = N * H * W * C; \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
micro::testing::MacsProcessed(macs); \
micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulBenchmark<TYPE>(iters, N, H, C, W); \
} \
MICRO_BENCHMARK(MICRO_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE)
#define MICRO_BM_MATMUL_OP(N, H, C, W) \
MICRO_BM_MATMUL_MACRO(N, H, C, W, float)
MICRO_BM_MATMUL_OP(1, 300, 32, 1);
MICRO_BM_MATMUL_OP(1, 32, 64, 32);
MICRO_BM_MATMUL_OP(2, 16, 16, 49);
MICRO_BM_MATMUL_OP(3, 16, 16, 49);
MICRO_BM_MATMUL_OP(4, 16, 16, 49);
MICRO_BM_MATMUL_OP(4, 8, 32, 49);
MICRO_BM_MATMUL_OP(4, 32, 32, 49);
#define MICRO_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE) \
static void MICRO_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE( \
int32_t iters) { \
const int64_t macs = N * H * W * C; \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
micro::testing::MacsProcessed(macs); \
micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulBenchmark<TYPE>(iters, N, H, C, W); \
} \
MICRO_BENCHMARK(MICRO_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE)
#define MICRO_BM_MATMUL_TRANSPOSE(N, H, C, W) \
MICRO_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float)
MICRO_BM_MATMUL_TRANSPOSE(4, 8, 32, 49);
MICRO_BM_MATMUL_TRANSPOSE(2, 16, 16, 49);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/nhwc/batch_norm.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
template<typename T>
void BatchNorm(int iters, const int N, const int H, const int W, const int C) {
micro::testing::StopTiming();
BatchNormOp batch_norm_op;
framework::SubstituteOp substitude_op;
const int32_t input_length = N * H * W * C;
MACE_DEFINE_RANDOM_INPUT(T, input, input_length);
MACE_DEFINE_RANDOM_INPUT(T, scale, static_cast<int32_t>(C));
MACE_DEFINE_RANDOM_INPUT(T, offset, static_cast<int32_t>(C));
MACE_DEFINE_RANDOM_INPUT(T, mean, static_cast<int32_t>(C));
MACE_DEFINE_RANDOM_INPUT(T, var, static_cast<int32_t>(C));
T *output = common::test::GetGlobalBuffer()->GetBuffer<T>(input_length);
int32_t input_dims[] = {N, H, W, C};
int32_t other_dims[] = {C};
int32_t output_dims[4] = {0};
substitude_op.AddInput(input, input_dims, 4)
.AddInput(scale, other_dims, 1)
.AddInput(offset, other_dims, 1)
.AddInput(mean, other_dims, 1)
.AddInput(var, other_dims, 1)
.AddArg("epsilon", 1e-3)
.AddOutput(output, output_dims, 4);
batch_norm_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
// Warm-up
for (int i = 0; i < 2; ++i) {
batch_norm_op.Run();
}
micro::testing::StartTiming();
while (iters--) {
batch_norm_op.Run();
}
}
} // namespace
// Defines and registers one BatchNorm benchmark.
// Fix: the inner macro previously ended with a stray `;`, so every
// MICRO_BM_BATCH_NORM(...); expansion produced a double semicolon at
// namespace scope (pedantic warning; inconsistent with the sibling macros).
// NOTE(review): the macro names its parameters (N, C, H, W) but forwards
// them to BatchNorm as (N, H, W, C), and the invocation values below look
// NHWC-ordered — confirm the intended parameter order.
#define MICRO_BM_BATCH_NORM_MACRO(N, C, H, W, TYPE) \
static void MICRO_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE( \
int32_t iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
micro::testing::MacsProcessed(tot); \
micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BatchNorm<TYPE>(iters, N, H, W, C); \
} \
MICRO_BENCHMARK(MICRO_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE)
#define MICRO_BM_BATCH_NORM(N, C, H, W) \
MICRO_BM_BATCH_NORM_MACRO(N, C, H, W, float)
MICRO_BM_BATCH_NORM(1, 128, 128, 1);
MICRO_BM_BATCH_NORM(1, 128, 128, 3);
MICRO_BM_BATCH_NORM(1, 64, 64, 3);
MICRO_BM_BATCH_NORM(1, 56, 56, 16);
MICRO_BM_BATCH_NORM(1, 28, 28, 64);
MICRO_BM_BATCH_NORM(1, 14, 14, 64);
MICRO_BM_BATCH_NORM(1, 14, 14, 32);
MICRO_BM_BATCH_NORM(1, 7, 7, 1024);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/nhwc/conv_2d_ref.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
template<typename T>
void Conv2d(int iters,
const T *input, const int32_t *input_dims,
const T *filter, const int32_t *filter_dims,
const T *bias, T *output, int32_t *output_dims,
int32_t stride, int32_t dilation, Padding padding) {
micro::testing::StopTiming();
Conv2dRefOp conv2d_op;
framework::SubstituteOp substitude_op;
int32_t strides[] = {stride, stride};
int32_t dilations[] = {dilation, dilation};
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, filter_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", static_cast<int32_t>(padding))
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
conv2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
// Warm-up
for (int i = 0; i < 2; ++i) {
conv2d_op.Run();
}
micro::testing::StartTiming();
while (iters--) {
conv2d_op.Run();
}
}
} // namespace
// Defines and registers one Conv2d benchmark. Computes output height/width
// from padding (half-kernel for SAME), stride and dilation to estimate MACs
// as N*OH*OW*OC*KH*KW*C. The output buffer is sized N*H*W*OC, an upper
// bound on the true output size for stride >= 1.
#define MICRO_BM_CONV_2D_MACRO(\
N, H, W, C, KH, KW, STRIDE, DILATION, P, OC, TYPE) \
static void \
MICRO_BM_CONV_2D_##N##_##H##_##W##_##C##_K##KH##x##KW##S##STRIDE##D##\
DILATION##_##P##_##OC##_##TYPE(int32_t iters) { \
const int32_t input_length = N * H * W * C; \
const int64_t tot = static_cast<int64_t>(iters) * input_length; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
const int64_t macs = N * oh * ow * OC * KH * KW * C; \
MACE_DEFINE_RANDOM_INPUT(TYPE, input, input_length); \
const int32_t filter_length = OC * KH * KW * C; \
MACE_DEFINE_RANDOM_INPUT(TYPE, filter, filter_length); \
MACE_DEFINE_RANDOM_INPUT(TYPE, bias, (int32_t)OC); \
const int32_t output_length = N * H * W * OC; \
TYPE *output = \
common::test::GetGlobalBuffer()->GetBuffer<TYPE>(output_length); \
int32_t input_dims[] = {N, H, W, C}; \
int32_t filter_dims[] = {OC, KH, KW, C}; \
int32_t output_dims[4] = {0}; \
micro::testing::MacsProcessed(macs); \
micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<TYPE>(iters, input, input_dims, \
filter, filter_dims, bias, output, \
output_dims, STRIDE, DILATION, P); \
} \
MICRO_BENCHMARK( \
MICRO_BM_CONV_2D_##N##_##H##_##W##_##C##_K##KH##x##KW##S##STRIDE##D##\
DILATION##_##P##_##OC##_##TYPE)
#define MICRO_BM_CONV_2D(N, H, W, C, KH, KW, S, D, P, OC) \
MICRO_BM_CONV_2D_MACRO(N, H, W, C, KH, KW, S, D, P, OC, float)
// Kernel shapes cover 1x1, square, asymmetric (15x1/1x15) and strided cases.
MICRO_BM_CONV_2D(1, 32, 32, 64, 1, 1, 1, 1, VALID, 32);
MICRO_BM_CONV_2D(1, 33, 31, 64, 1, 1, 1, 1, VALID, 32);
MICRO_BM_CONV_2D(1, 32, 32, 64, 3, 3, 1, 1, SAME, 32);
MICRO_BM_CONV_2D(1, 33, 31, 64, 3, 3, 1, 1, SAME, 32);
MICRO_BM_CONV_2D(1, 32, 32, 64, 5, 5, 1, 1, SAME, 32);
MICRO_BM_CONV_2D(1, 32, 31, 64, 5, 5, 1, 1, SAME, 32);
MICRO_BM_CONV_2D(1, 32, 31, 64, 15, 1, 1, 1, SAME, 32);
MICRO_BM_CONV_2D(1, 32, 31, 64, 1, 15, 1, 1, SAME, 32);
MICRO_BM_CONV_2D(1, 32, 31, 64, 7, 7, 1, 1, SAME, 32);
MICRO_BM_CONV_2D(1, 32, 31, 64, 7, 7, 2, 1, SAME, 32);
MICRO_BM_CONV_2D(1, 32, 31, 64, 7, 7, 3, 1, SAME, 32);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/nhwc/conv_2d_c4_s4.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
template<typename T>
void Conv2dOpt(int iters,
const T *input, const int32_t *input_dims,
const T *filter, const int32_t *filter_dims,
const T *bias, T *output, int32_t *output_dims,
int32_t stride, int32_t dilation, Padding padding) {
micro::testing::StopTiming();
Conv2dC4S4Op conv2d_opt_op;
framework::SubstituteOp substitude_op;
int32_t strides[] = {stride, stride};
int32_t dilations[] = {dilation, dilation};
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, filter_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", static_cast<int32_t>(padding))
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
conv2d_opt_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
// Warm-up
for (int i = 0; i < 2; ++i) {
conv2d_opt_op.Run();
}
micro::testing::StartTiming();
while (iters--) {
conv2d_opt_op.Run();
}
}
} // namespace
// Defines and registers one optimized-Conv2d benchmark; mirrors
// MICRO_BM_CONV_2D_MACRO so the same shapes can be compared against the
// reference kernel.
#define MICRO_BM_CONV_2D_OPT_MACRO(\
N, H, W, C, KH, KW, STRIDE, DILATION, P, OC, TYPE) \
static void \
MICRO_BM_CONV_2D_OPT_##N##_##H##_##W##_##C##_K##KH##x##KW##S##STRIDE##D##\
DILATION##_##P##_##OC##_##TYPE(int32_t iters) { \
const int32_t input_length = N * H * W * C; \
const int64_t tot = static_cast<int64_t>(iters) * input_length; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
const int64_t macs = N * oh * ow * OC * KH * KW * C; \
MACE_DEFINE_RANDOM_INPUT(TYPE, input, input_length); \
const int32_t filter_length = OC * KH * KW * C; \
MACE_DEFINE_RANDOM_INPUT(TYPE, filter, filter_length); \
MACE_DEFINE_RANDOM_INPUT(TYPE, bias, (int32_t)OC); \
const int32_t output_length = N * H * W * OC; \
TYPE *output = \
common::test::GetGlobalBuffer()->GetBuffer<TYPE>(output_length); \
int32_t input_dims[] = {N, H, W, C}; \
int32_t filter_dims[] = {OC, KH, KW, C}; \
int32_t output_dims[4] = {0}; \
micro::testing::MacsProcessed(macs); \
micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2dOpt<TYPE>(iters, input, input_dims, \
filter, filter_dims, bias, output, \
output_dims, STRIDE, DILATION, P); \
} \
MICRO_BENCHMARK( \
MICRO_BM_CONV_2D_OPT_##N##_##H##_##W##_##C##_K##KH##x##KW##S##STRIDE##\
D##DILATION##_##P##_##OC##_##TYPE)
#define MICRO_BM_CONV_2D_OPT(N, H, W, C, KH, KW, S, D, P, OC) \
MICRO_BM_CONV_2D_OPT_MACRO(N, H, W, C, KH, KW, S, D, P, OC, float)
// Same shape set as the reference Conv2d benchmarks above.
MICRO_BM_CONV_2D_OPT(1, 32, 32, 64, 1, 1, 1, 1, VALID, 32);
MICRO_BM_CONV_2D_OPT(1, 33, 31, 64, 1, 1, 1, 1, VALID, 32);
MICRO_BM_CONV_2D_OPT(1, 32, 32, 64, 3, 3, 1, 1, SAME, 32);
MICRO_BM_CONV_2D_OPT(1, 33, 31, 64, 3, 3, 1, 1, SAME, 32);
MICRO_BM_CONV_2D_OPT(1, 32, 32, 64, 5, 5, 1, 1, SAME, 32);
MICRO_BM_CONV_2D_OPT(1, 32, 31, 64, 5, 5, 1, 1, SAME, 32);
MICRO_BM_CONV_2D_OPT(1, 32, 31, 64, 15, 1, 1, 1, SAME, 32);
MICRO_BM_CONV_2D_OPT(1, 32, 31, 64, 1, 15, 1, 1, SAME, 32);
MICRO_BM_CONV_2D_OPT(1, 32, 31, 64, 7, 7, 1, 1, SAME, 32);
MICRO_BM_CONV_2D_OPT(1, 32, 31, 64, 7, 7, 2, 1, SAME, 32);
MICRO_BM_CONV_2D_OPT(1, 32, 31, 64, 7, 7, 3, 1, SAME, 32);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/nhwc/depthwise_conv_2d_ref.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
template<typename T>
void Conv2d(int iters,
const T *input, const int32_t *input_dims,
const T *filter, const int32_t *filter_dims,
const T *bias, T *output, int32_t *output_dims,
int32_t stride, int32_t dilation, Padding padding) {
micro::testing::StopTiming();
DepthwiseConv2dRefOp depthwise_conv2d_op;
framework::SubstituteOp substitude_op;
int32_t strides[] = {stride, stride};
int32_t dilations[] = {dilation, dilation};
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, filter_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", static_cast<int32_t>(padding))
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
depthwise_conv2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
// Warm-up
for (int i = 0; i < 2; ++i) {
depthwise_conv2d_op.Run();
}
micro::testing::StartTiming();
while (iters--) {
depthwise_conv2d_op.Run();
}
}
} // namespace
// Defines and registers one depthwise-Conv2d benchmark; same structure as
// MICRO_BM_CONV_2D_MACRO, with OC acting as the channel multiplier
// (filter shape {OC, KH, KW, C}).
#define MICRO_BM_DEPTHWISE_CONV_2D_MACRO(\
N, H, W, C, KH, KW, STRIDE, DILATION, P, OC, TYPE) \
static void \
MICRO_BM_DEPTHWISE_CONV_2D_##N##_##H##_##W##_##C##_K##KH##x##KW##S##\
STRIDE##D##DILATION##_##P##_##OC##_##TYPE(int32_t iters) { \
const int32_t input_length = N * H * W * C; \
const int64_t tot = static_cast<int64_t>(iters) * input_length; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
const int64_t macs = N * oh * ow * OC * KH * KW * C; \
MACE_DEFINE_RANDOM_INPUT(TYPE, input, input_length); \
const int32_t filter_length = OC * KH * KW * C; \
MACE_DEFINE_RANDOM_INPUT(TYPE, filter, filter_length); \
MACE_DEFINE_RANDOM_INPUT(TYPE, bias, (int32_t)OC); \
const int32_t output_length = N * H * W * OC; \
TYPE *output = \
common::test::GetGlobalBuffer()->GetBuffer<TYPE>(output_length); \
int32_t input_dims[] = {N, H, W, C}; \
int32_t filter_dims[] = {OC, KH, KW, C}; \
int32_t output_dims[4] = {0}; \
micro::testing::MacsProcessed(macs); \
micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<TYPE>(iters, input, input_dims, \
filter, filter_dims, bias, output, \
output_dims, STRIDE, DILATION, P); \
} \
MICRO_BENCHMARK( \
MICRO_BM_DEPTHWISE_CONV_2D_##N##_##H##_##W##_##C##_K##KH##x##KW##S##\
STRIDE##D##DILATION##_##P##_##OC##_##TYPE)
#define MICRO_BM_DEPTHWISE_CONV_2D(N, H, W, C, KH, KW, S, D, P, OC) \
MICRO_BM_DEPTHWISE_CONV_2D_MACRO(N, H, W, C, KH, KW, S, D, P, OC, float)
MICRO_BM_DEPTHWISE_CONV_2D(1, 8, 8, 32, 1, 1, 1, 1, VALID, 1);
MICRO_BM_DEPTHWISE_CONV_2D(1, 9, 7, 32, 1, 1, 1, 1, VALID, 1);
MICRO_BM_DEPTHWISE_CONV_2D(1, 8, 8, 32, 3, 3, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D(1, 9, 7, 32, 3, 3, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D(1, 8, 8, 32, 5, 5, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D(1, 8, 7, 32, 5, 5, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D(1, 8, 7, 32, 15, 1, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D(1, 8, 7, 32, 1, 15, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D(1, 8, 7, 32, 7, 7, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D(1, 8, 7, 32, 7, 7, 2, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D(1, 8, 7, 32, 7, 7, 3, 1, SAME, 1);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/nhwc/depthwise_conv_2d_kb1_s4.h"
#include "micro/ops/nhwc/depthwise_conv_2d_kb4_s4.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
template<typename T>
void DepthwiseConv2dOpt(int iters,
const T *input, const int32_t *input_dims,
const T *filter, const int32_t *filter_dims,
const T *bias, T *output, int32_t *output_dims,
int32_t stride, int32_t dilation, Padding padding) {
micro::testing::StopTiming();
DepthwiseConv2dKB1S4Op depthwise_conv2d_opt_op;
framework::SubstituteOp substitude_op;
int32_t strides[] = {stride, stride};
int32_t dilations[] = {dilation, dilation};
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, filter_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", static_cast<int32_t>(padding))
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
depthwise_conv2d_opt_op.Init(
NULL, reinterpret_cast<framework::OpContext *>(&substitude_op),
NULL);
// Warm-up
for (int i = 0; i < 2; ++i) {
depthwise_conv2d_opt_op.Run();
}
micro::testing::StartTiming();
while (iters--) {
depthwise_conv2d_opt_op.Run();
}
}
} // namespace
// Defines and registers one optimized depthwise-Conv2d benchmark; mirrors
// MICRO_BM_DEPTHWISE_CONV_2D_MACRO so the same shapes can be compared
// against the reference kernel.
#define MICRO_BM_DEPTHWISE_CONV_2D_OPT_MACRO(\
N, H, W, C, KH, KW, STRIDE, DILATION, P, OC, TYPE) \
static void \
MICRO_BM_DEPTHWISE_CONV_2D_OPT_##N##_##H##_##W##_##C##_K##KH##x##KW##S##\
STRIDE##D##DILATION##_##P##_##OC##_##TYPE(int32_t iters) { \
const int32_t input_length = N * H * W * C; \
const int64_t tot = static_cast<int64_t>(iters) * input_length; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
const int64_t macs = N * oh * ow * OC * KH * KW * C; \
MACE_DEFINE_RANDOM_INPUT(TYPE, input, input_length); \
const int32_t filter_length = OC * KH * KW * C; \
MACE_DEFINE_RANDOM_INPUT(TYPE, filter, filter_length); \
MACE_DEFINE_RANDOM_INPUT(TYPE, bias, (int32_t)OC); \
const int32_t output_length = N * H * W * OC; \
TYPE *output = \
common::test::GetGlobalBuffer()->GetBuffer<TYPE>(output_length); \
int32_t input_dims[] = {N, H, W, C}; \
int32_t filter_dims[] = {OC, KH, KW, C}; \
int32_t output_dims[4] = {0}; \
micro::testing::MacsProcessed(macs); \
micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2dOpt<TYPE>(iters, input, input_dims, \
filter, filter_dims, bias, output, \
output_dims, STRIDE, DILATION, P); \
} \
MICRO_BENCHMARK( \
MICRO_BM_DEPTHWISE_CONV_2D_OPT_##N##_##H##_##W##_##C##_K##KH##x##KW##\
S##STRIDE##D##DILATION##_##P##_##OC##_##TYPE)
#define MICRO_BM_DEPTHWISE_CONV_2D_OPT(N, H, W, C, KH, KW, S, D, P, OC) \
MICRO_BM_DEPTHWISE_CONV_2D_OPT_MACRO(N, H, W, C, KH, KW, S, D, P, OC, float)
// Same shape set as the reference depthwise benchmarks above.
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 8, 8, 32, 1, 1, 1, 1, VALID, 1);
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 9, 7, 32, 1, 1, 1, 1, VALID, 1);
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 8, 8, 32, 3, 3, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 9, 7, 32, 3, 3, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 8, 8, 32, 5, 5, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 8, 7, 32, 5, 5, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 8, 7, 32, 15, 1, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 8, 7, 32, 1, 15, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 8, 7, 32, 7, 7, 1, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 8, 7, 32, 7, 7, 2, 1, SAME, 1);
MICRO_BM_DEPTHWISE_CONV_2D_OPT(1, 8, 7, 32, 7, 7, 3, 1, SAME, 1);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/nhwc/pooling_ref.h"
#include "micro/ops/nhwc/pooling_s4.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
template<typename T>
void Pooling(int iters, const T *input, const int32_t *input_dims,
T *output, int32_t *output_dims, int32_t kernel,
int32_t stride, Padding padding, PoolingType pooling_type) {
micro::testing::StopTiming();
PoolingS4Op pooling_op;
framework::SubstituteOp substitude_op;
int32_t strides[] = {stride, stride};
int32_t kernels[] = {kernel, kernel};
int32_t dilations[] = {1, 1};
substitude_op.AddInput(input, input_dims, 4)
.AddArg("pooling_type", static_cast<int32_t>(pooling_type))
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", static_cast<int32_t>(padding))
.AddRepeatArg("kernels", kernels, sizeof(kernels) / sizeof(int32_t))
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
pooling_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
// Warm-up
for (int i = 0; i < 2; ++i) {
pooling_op.Run();
}
micro::testing::StartTiming();
while (iters--) {
pooling_op.Run();
}
}
} // namespace
// Defines and registers one pooling benchmark named
// MICRO_BM_POOLING_<N>_<H>_<W>_<C>_K<KE>S<STRIDE>_<PA>_<PO>_<TYPE>.
// The output buffer is conservatively sized to the input length, which is
// always large enough since pooling never enlarges the tensor.
#define MICRO_BM_POOLING_MACRO(N, H, W, C, KE, STRIDE, PA, PO, TYPE) \
  static void \
  MICRO_BM_POOLING_##N##_##H##_##W##_##C##_K##KE##S##STRIDE##_##PA##_\
##PO##_##TYPE(int32_t iters) { \
    const int32_t input_length = N * H * W * C; \
    const int64_t tot = static_cast<int64_t>(iters) * input_length; \
    micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    MACE_DEFINE_RANDOM_INPUT(TYPE, input, input_length); \
    const int32_t output_length = input_length; \
    TYPE *output = \
        common::test::GetGlobalBuffer()->GetBuffer<TYPE>(output_length); \
    int32_t input_dims[] = {N, H, W, C}; \
    int32_t output_dims[4] = {0}; \
    Pooling<TYPE>(iters, input, input_dims, \
                  output, output_dims, KE, STRIDE, PA, PO); \
  } \
  MICRO_BENCHMARK( \
      MICRO_BM_POOLING_##N##_##H##_##W##_##C##_K##KE##S##STRIDE##_##PA##_\
##PO##_##TYPE)

// Float-typed convenience wrapper.
#define MICRO_BM_POOLING(N, H, W, C, K, S, PA, PO) \
  MICRO_BM_POOLING_MACRO(N, H, W, C, K, S, PA, PO, float)

// Covers small-kernel max pooling and global-style average pooling.
MICRO_BM_POOLING(1, 129, 129, 3, 2, 2, SAME, MAX);
MICRO_BM_POOLING(1, 65, 65, 3, 2, 2, SAME, MAX);
MICRO_BM_POOLING(1, 48, 64, 8, 48, 64, VALID, AVG);
MICRO_BM_POOLING(1, 7, 7, 8, 7, 1, VALID, AVG);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/reduce.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
// Benchmark driver for ReduceOp<T>: builds a random NHWC input of shape
// {N, H, W, C}, reduces over axes {1, 2} (the H and W dimensions), warms up
// twice untimed, then times `iters` runs.
template<typename T>
void Reduce(int32_t iters, const int32_t N,
            const int32_t H, const int32_t W, const int32_t C) {
  micro::testing::StopTiming();
  ReduceOp<T> reduce_op;
  framework::SubstituteOp substitude_op;
  const int32_t input_length = N * H * W * C;
  MACE_DEFINE_RANDOM_INPUT(T, input, input_length);
  // Output buffer is conservatively sized to the full input length; the
  // reduced result is always smaller.
  T *output = common::test::GetGlobalBuffer()->GetBuffer<T>(input_length);
  int32_t input_dims[] = {N, H, W, C};
  int32_t output_dims[4] = {0};
  int32_t axis[] = {1, 2};
  substitude_op.AddInput(input, input_dims, 4)
      .AddRepeatArg("axis", axis, sizeof(axis) / sizeof(int32_t))
      .AddOutput(output, output_dims, 4);
  reduce_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  // Warm-up (untimed).
  for (int32_t i = 0; i < 2; ++i) {
    reduce_op.Run();
  }
  micro::testing::StartTiming();
  while (iters--) {
    reduce_op.Run();
  }
}
} // namespace
// Defines and registers one reduce benchmark named
// MICRO_BM_REDUCE_<N>_<H>_<W>_<C>_<TYPE>.
#define MICRO_BM_REDUCE_MACRO(N, H, W, C, TYPE) \
  static void MICRO_BM_REDUCE_##N##_##H##_##W##_##C##_##TYPE( \
      int32_t iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
    micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    Reduce<TYPE>(iters, N, H, W, C); \
  } \
  MICRO_BENCHMARK(MICRO_BM_REDUCE_##N##_##H##_##W##_##C##_##TYPE)

// Float-typed convenience wrapper.
#define MICRO_BM_REDUCE(N, H, W, C) \
  MICRO_BM_REDUCE_MACRO(N, H, W, C, float)

MICRO_BM_REDUCE(1, 128, 128, 1);
MICRO_BM_REDUCE(4, 64, 64, 3);
MICRO_BM_REDUCE(2, 128, 128, 1);
MICRO_BM_REDUCE(2, 28, 28, 32);
MICRO_BM_REDUCE(1, 32, 32, 16);
MICRO_BM_REDUCE(1, 48, 64, 8);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/benchmark_utils/test_benchmark.h"
#include "micro/ops/softmax.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
// Benchmark driver for SoftmaxOp on a random input. Callers must pass the
// dimensions in (batch, height, width, channels) order: the values are used
// both for buffer sizing (their product) and as the 4-D NHWC input shape
// handed to the op. Two untimed warm-up runs precede the timed loop.
template<typename T>
void SoftmaxBenchmark(int32_t iters, const int32_t N,
                      const int32_t H, const int32_t W, const int32_t C) {
  micro::testing::StopTiming();
  SoftmaxOp softmax_op;
  framework::SubstituteOp substitude_op;
  const int32_t input_length = N * H * W * C;
  MACE_DEFINE_RANDOM_INPUT(T, input, input_length);
  T *output = common::test::GetGlobalBuffer()->GetBuffer<T>(input_length);
  int32_t input_dims[] = {N, H, W, C};
  int32_t output_dims[4] = {0};
  substitude_op.AddInput(input, input_dims, 4)
      .AddOutput(output, output_dims, 4);
  softmax_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  // Warm-up (untimed).
  for (int32_t i = 0; i < 2; ++i) {
    softmax_op.Run();
  }
  micro::testing::StartTiming();
  while (iters--) {
    softmax_op.Run();
  }
}
} // namespace
// Defines and registers one softmax benchmark named
// MICRO_BM_SOFTMAX_<N>_<H>_<W>_<C>_<TYPE>.
#define MICRO_BM_SOFTMAX_MACRO(N, H, W, C, TYPE) \
  static void MICRO_BM_SOFTMAX_##N##_##H##_##W##_##C##_##TYPE( \
      int32_t iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
    micro::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    /* Bug fix: arguments were previously passed as (N, C, H, W), which */ \
    /* benchmarked a transposed shape; the signature is (N, H, W, C).   */ \
    SoftmaxBenchmark<TYPE>(iters, N, H, W, C); \
  } \
  MICRO_BENCHMARK(MICRO_BM_SOFTMAX_##N##_##H##_##W##_##C##_##TYPE)
// Float-typed convenience wrapper.
#define MICRO_BM_SOFTMAX(N, H, W, C) \
  MICRO_BM_SOFTMAX_MACRO(N, H, W, C, float)

// Shapes ranging from shallow/large spatial to deep/small spatial.
MICRO_BM_SOFTMAX(1, 64, 64, 2);
MICRO_BM_SOFTMAX(1, 64, 64, 3);
MICRO_BM_SOFTMAX(1, 32, 32, 4);
MICRO_BM_SOFTMAX(1, 16, 16, 10);
MICRO_BM_SOFTMAX(1, 7, 7, 128);
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "remote.idl"

// FastRPC interface used to drive the micro benchmark suite on the DSP.
// Extends remote_handle64, so every method implicitly carries the session
// handle; run() executes the remote benchmark once.
interface benchmark: remote_handle64 {
  long run();
};
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <HAP_farf.h>
#include "rpc/skel/base_func.h"
/* Implemented elsewhere on the DSP side; runs the full benchmark suite. */
extern void BenchmarkRun();

/* NOTE(review): MACE_DEFINE_RANDOM_INPUT is the test-utils random-input
 * macro; given the rpc/skel/base_func.h include above, a skeleton
 * base-function macro (defining benchmark_open/benchmark_close) seems
 * intended here -- confirm. */
MACE_DEFINE_RANDOM_INPUT(benchmark)

/* FastRPC skeleton entry for benchmark::run(); h is the remote session
 * handle, logged for diagnostics. Always reports success. */
int benchmark_run(remote_handle64 h) {
  BenchmarkRun();
  FARF(ALWAYS, "run end, h=%d", h);
  return 0;
}
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/rpc/stub/benchmark.h"
#include "micro/test/ccbenchmark/codegen/benchmark.h"
namespace micro {
namespace testing {
namespace {
// URI used to open the FastRPC session; "_dom=sdsp" selects the DSP domain
// (presumably the sensor DSP -- confirm against the deployment target).
const char kBenchmarkUri[] = benchmark_URI"&_dom=sdsp";
}  // namespace

// Binds the generated benchmark_open/benchmark_close entry points and the
// session URI to the shared BaseHandle lifecycle helper.
Benchmark::Benchmark() :
    rpc::stub::BaseHandle(benchmark_open, benchmark_close, kBenchmarkUri) {}

// Invokes benchmark::run() on the remote side via the opened handle.
void Benchmark::Run() {
  benchmark_run(remote_handle_);
}
}  // namespace testing
}  // namespace micro

// Drives one full remote benchmark cycle: open session, run, close.
void BenchmarkRun() {
  micro::testing::Benchmark benchmark;
  benchmark.Open();
  benchmark.Run();
  benchmark.Close();
}
// Copyright 2018 The MICRO Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_TEST_CCBENCHMARK_MICRO_RPC_STUB_BENCHMARK_H_
#define MICRO_TEST_CCBENCHMARK_MICRO_RPC_STUB_BENCHMARK_H_

#include "rpc/stub/base_handle.h"

namespace micro {
namespace testing {

// CPU-side stub for the `benchmark` FastRPC interface. Session lifecycle
// (Open/Close) is inherited from BaseHandle; Run() triggers the remote
// benchmark once.
class Benchmark : public rpc::stub::BaseHandle {
 public:
  Benchmark();
  void Run();
};

}  // namespace testing
}  // namespace micro

// Convenience driver: opens a session, runs the benchmark, closes it.
void BenchmarkRun();

#endif  // MICRO_TEST_CCBENCHMARK_MICRO_RPC_STUB_BENCHMARK_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef MACE_ENABLE_HEXAGON
#include "micro/rpc/stub/benchmark.h"
#else
#include "micro/benchmark_utils/test_benchmark.h"
#endif
// Entry point: delegates to BenchmarkRun(), which either drives the Hexagon
// RPC stub (MACE_ENABLE_HEXAGON) or runs the benchmarks in-process.
int main(int argc, char *argv[]) {
  static_cast<void>(argc);  // unused
  static_cast<void>(argv);  // unused
  BenchmarkRun();
  return 0;
}
# Targets here are publicly visible so other packages can depend on the
# generated test binaries.
package(
    default_visibility = ["//visibility:public"],
)

licenses(["notice"])  # Apache 2.0

# Framework/model/codegen unit tests; links the generated model data and
# the generated micro engine.
cc_test(
    name = "micro_cc_test",
    testonly = 1,
    srcs = glob(
        [
            "micro/model/*.cc",
            "micro/framework/*.cc",
            "micro/codegen/*.cc",
        ],
    ),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    linkstatic = 1,
    deps = [
        "//micro/base",
        "//micro/codegen:generated_models",
        "//micro/codegen:micro_engine",
        "//micro/framework",
        "@gtest//:gtest_main",
    ],
)

# Operator unit tests (including the NHWC kernels); model-independent, so
# no generated-model deps are needed.
cc_test(
    name = "micro_ops_test",
    testonly = 1,
    srcs = glob(
        [
            "micro/ops/*.cc",
            "micro/ops/nhwc/*.cc",
        ],
    ),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    linkstatic = 1,
    deps = [
        "//micro/base",
        "//micro/ops:ops_for_test",
        "//micro/test/ccutils:ccutils_with_gtest",
        "@gtest//:gtest_main",
    ],
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "micro/base/logging.h"
#include "micro/include/public/micro.h"
#ifndef MICRO_MODEL_NAME
#error Please specify model name in the command
#endif
namespace micro {
namespace MICRO_MODEL_NAME {
MaceStatus GetMicroEngineSingleton(MaceMicroEngine **engine);
} // namespace MICRO_MODEL_NAME
class EngineTest : public ::testing::Test {
};

// Smoke-tests the generated engine singleton: registers a zero-filled input
// of shape {1, 1, 128, 9}, runs inference once, and fetches the output
// buffer/shape metadata.
// NOTE(review): the input shape is hard-coded; confirm it matches the first
// input of the model selected via MICRO_MODEL_NAME.
void OutputAllInfo() {
  MaceMicroEngine *micro_engine = NULL;
  MACE_ASSERT(MICRO_MODEL_NAME::GetMicroEngineSingleton(&micro_engine)
                  == MACE_SUCCESS && micro_engine != NULL);
  float input_buffer[1 * 1 * 128 * 9] = {0};
  int32_t input_shape[] = {1, 1, 128, 9};
  micro_engine->RegisterInputData(0, input_buffer, input_shape);
  MACE_ASSERT(MACE_SUCCESS == micro_engine->Run());
  void *output_buffer = NULL;
  const int32_t *output_dims = NULL;
  uint32_t dim_size = 0;
  micro_engine->GetOutputData(0, &output_buffer, &output_dims, &dim_size);
  LOG(INFO) << "EngineTest success, dim_size=" << dim_size;
}

TEST_F(EngineTest, OutputAllInfo) {
  OutputAllInfo();
}
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fcntl.h>
#include <gtest/gtest.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include "micro/base/logging.h"
#include "micro/framework/graph.h"
#include "micro/include/utils/macros.h"
#ifndef MICRO_MODEL_NAME
#error Please specify model name in the command
#endif
namespace micro {
namespace MICRO_MODEL_NAME {
extern uint8_t kGraphData[];
} // namespace MICRO_MODEL_NAME
namespace framework {
#ifdef MACE_WRITE_MAGIC
#define MACE_CHECK_MAGIC_CODE(OBJ_NAME) \
MACE_ASSERT1(CheckMagic(OBJ_NAME, OBJ_NAME->GetMagic(), \
OBJ_NAME->GetHardCodeMagic()), "CheckMagic failed.")
// Compares a serialized object's stored magic word against its hard-coded
// expectation, logging both (rendered as short strings) on mismatch.
// Returns true iff they match. Only compiled under MACE_WRITE_MAGIC.
bool CheckMagic(const Serialize *serial_obj,
                SerialUint32 magic, SerialUint32 hard_code_magic) {
  char str_magic[5] = {0};  // room for 4 chars + NUL terminator
  serial_obj->MagicToString(magic, str_magic);
  bool succ = (magic == hard_code_magic);
  if (!succ) {
    char str_hc_magic[5] = {0};
    serial_obj->MagicToString(hard_code_magic, str_hc_magic);
    LOG(INFO) << "The magic is invalid, " << "magic = " << str_magic
              << ", hard_code_magic = " << str_hc_magic;
  } else {
    LOG(INFO) << "OK, The magic is " << str_magic;
  }
  return succ;
}
#else
#define MACE_CHECK_MAGIC_CODE(OBJ_NAME)
#endif
class GraphTest : public ::testing::Test {
};

// Logs one op context: its op index and, for each input, the producing
// op-def index and output slot (after pointer fix-up via Uint2OpIOInfo).
void OutputOpContextInfo(const Graph *graph, const OpContext *op_context) {
  LOG(INFO) << "op_idx is: " << op_context->op_idx();
  uint32_t input_info_size = op_context->input_info_size();
  LOG(INFO) << "input_info size size is: " << input_info_size;
  for (uint32_t i = 0; i < input_info_size; ++i) {
    const OpIOInfo *input_info = op_context->input_info(i);
    // NOTE(review): presumably converts a serialized offset into a usable
    // OpIOInfo in place -- confirm against Graph::Uint2OpIOInfo.
    graph->Uint2OpIOInfo(input_info);
    LOG(INFO) << "op_def_idx_: " << input_info->op_def_idx_
              << ", output_idx_: " << input_info->output_idx_;
  }
}

// Walks the whole graph and logs every op context, the graph's input op
// indices, and its output info entries.
void OutputGraphInfo(const Graph *graph) {
  MACE_CHECK_MAGIC_CODE(graph);
  uint32_t op_context_size = graph->op_context_size();
  LOG(INFO) << "op_context size is: " << op_context_size;
  for (uint32_t i = 0; i < op_context_size; ++i) {
    OutputOpContextInfo(graph, graph->op_context(i));
  }
  uint32_t input_op_idx_size = graph->input_op_idx_size();
  LOG(INFO) << "input_op_idx size is: " << input_op_idx_size;
  for (uint32_t i = 0; i < input_op_idx_size; ++i) {
    LOG(INFO) << "input_op_idx=" << graph->input_op_idx(i);
  }
  uint32_t output_info_size = graph->output_info_size();
  LOG(INFO) << "output_info size is: " << output_info_size;
  for (uint32_t i = 0; i < output_info_size; ++i) {
    const OpIOInfo *output_info = graph->output_info(i);
    graph->Uint2OpIOInfo(output_info);
    LOG(INFO) << "op_def_idx_ is: " << output_info->op_def_idx_
              << ", output_idx_ is: " << output_info->output_idx_;
  }
}

// Reinterprets the raw generated byte array as a Graph and dumps it.
void OutputAllInfo(const uint8_t *address) {
  const Graph *graph = reinterpret_cast<const Graph *>(address);
  MACE_ASSERT1(graph != NULL, "reinterpret_cast failed.");
  OutputGraphInfo(graph);
}

TEST_F(GraphTest, OutputAllInfo) {
  LOG(INFO) << "GraphTest start";
  OutputAllInfo(MICRO_MODEL_NAME::kGraphData);
}
} // namespace framework
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fcntl.h>
#include <gtest/gtest.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include "micro/base/logging.h"
#include "micro/include/utils/macros.h"
#include "micro/model/const_tensor.h"
#include "micro/model/net_def.h"
#include "micro/model/operator_def.h"
#ifndef MICRO_MODEL_NAME
#error Please specify model name in the command
#endif
namespace micro {
namespace MICRO_MODEL_NAME {
extern uint8_t kNetDef[];
} // namespace MICRO_MODEL_NAME
namespace model {
#ifdef MACE_WRITE_MAGIC
#define MACE_CHECK_MAGIC_CODE(OBJ_NAME) \
MACE_ASSERT1(CheckMagic(OBJ_NAME, OBJ_NAME->GetMagic(), \
OBJ_NAME->GetHardCodeMagic()), "CheckMagic failed.")
// Compares a serialized object's stored magic word against its hard-coded
// expectation, logging both on mismatch; returns true iff they match.
// Duplicated from the graph test in this file (same body, different
// namespace); only compiled under MACE_WRITE_MAGIC.
bool CheckMagic(const Serialize *serial_obj,
                SerialUint32 magic, SerialUint32 hard_code_magic) {
  char str_magic[5] = {0};  // room for 4 chars + NUL terminator
  serial_obj->MagicToString(magic, str_magic);
  bool succ = (magic == hard_code_magic);
  if (!succ) {
    char str_hc_magic[5] = {0};
    serial_obj->MagicToString(hard_code_magic, str_hc_magic);
    LOG(INFO) << "The magic is invalid, " << "magic = " << str_magic
              << ", hard_code_magic = " << str_hc_magic;
  } else {
    LOG(INFO) << "OK, The magic is " << str_magic;
  }
  return succ;
}
#else
#define MACE_CHECK_MAGIC_CODE(OBJ_NAME) MACE_UNUSED(OBJ_NAME)
#endif
class NetDefTest : public ::testing::Test {
};

// Logs an argument's name after validating its magic word.
void OutputArgumentInfo(const Argument *argument) {
  MACE_CHECK_MAGIC_CODE(argument);
  LOG(INFO) << "The argument name: " << argument->name();
}

// Logs one operator definition: name, input/output tensor names, memory
// offsets, and all of its arguments.
void OutputOperatorInfo(const OperatorDef *op_def) {
  MACE_CHECK_MAGIC_CODE(op_def);
  LOG(INFO) << "The op_def name: " << op_def->name();
  uint32_t input_size = op_def->input_size();
  LOG(INFO) << "\tThe op_def input size: " << input_size;
  for (uint32_t j = 0; j < input_size; ++j) {
    LOG(INFO) << "\t\tThe input name: " << op_def->input(j);
  }
  auto output_size = op_def->output_size();
  LOG(INFO) << "\tThe op_def output size: " << output_size;
  for (uint32_t k = 0; k < output_size; ++k) {
    LOG(INFO) << "\t\tThe output name: " << op_def->output(k);
  }
  auto mem_offset_size = op_def->mem_offset_size();
  LOG(INFO) << "\tThe mem_offset size: " << mem_offset_size;
  for (uint32_t k = 0; k < mem_offset_size; ++k) {
    LOG(INFO) << "\t\tThe " << k << "th mem_offset: " << op_def->mem_offset(k);
  }
  auto arg_size = op_def->arg_size();
  LOG(INFO) << "\tThe arg size: " << arg_size;
  for (uint32_t k = 0; k < arg_size; ++k) {
    OutputArgumentInfo(op_def->arg(k));
  }
}

// Logs a constant tensor's name, dims, and any inline float data.
void OutputTensorInfo(const ConstTensor *tensor) {
  MACE_CHECK_MAGIC_CODE(tensor);
  LOG(INFO) << "The tensor name: " << tensor->name();
  auto dim_size = tensor->dim_size();
  LOG(INFO) << "\tThe tensor dim size: " << dim_size;
  for (uint32_t i = 0; i < dim_size; ++i) {
    LOG(INFO) << "\t\ttensor dim[" << i << "] = " << tensor->dim(i);
  }
  auto float_data_size = tensor->float_data_size();
  LOG(INFO) << "\tThe tensor float_data size: " << float_data_size;
  for (uint32_t i = 0; i < float_data_size; ++i) {
    const float f_value = tensor->float_data(i);
    LOG(INFO) << "\t\ttensor float_data[" << i << "] = " << f_value;
  }
  // NOTE(review): deliberately fails the test when inline float data is
  // present -- presumably generated models must not embed raw float
  // payloads here; confirm the intent before relying on it.
  if (float_data_size > 0) {
    MACE_ASSERT(false);
  }
}
// Walks a NetDef and logs every operator, top-level argument, constant
// tensor, the data type, and the input/output info entries (the latter are
// only magic-checked, not printed).
void OutputNetDefInfo(const NetDef *net_def) {
  MACE_CHECK_MAGIC_CODE(net_def);
  auto op_size = net_def->op_size();
  LOG(INFO) << "op size is: " << op_size;
  for (uint32_t i = 0; i < op_size; ++i) {
    OutputOperatorInfo(net_def->op(i));
  }
  auto arg_size = net_def->arg_size();
  LOG(INFO) << "arg size is: " << arg_size;
  auto arg_byte_size = sizeof(Argument);
  LOG(INFO) << "arg byte size is: " << (int32_t) arg_byte_size;
  for (uint32_t i = 0; i < arg_size; ++i) {
    OutputArgumentInfo(net_def->arg(i));
  }
  auto tensor_size = net_def->tensor_size();
  LOG(INFO) << "tensor size is: " << tensor_size;
  for (uint32_t i = 0; i < tensor_size; ++i) {
    OutputTensorInfo(net_def->tensor(i));
  }
  auto data_type = net_def->data_type();
  LOG(INFO) << "data_type is: " << data_type;
  auto input_info_size = net_def->input_info_size();
  LOG(INFO) << "input_info size is: " << input_info_size;
  for (uint32_t i = 0; i < input_info_size; ++i) {
    MACE_CHECK_MAGIC_CODE(net_def->input_info(i));
  }
  auto output_info_size = net_def->output_info_size();
  LOG(INFO) << "output_info size is: " << output_info_size;
  for (uint32_t i = 0; i < output_info_size; ++i) {
    MACE_CHECK_MAGIC_CODE(net_def->output_info(i));
  }
}

// Reinterprets the raw generated byte array as a NetDef and dumps it.
void OutputAllInfo(const uint8_t *address) {
  const NetDef *net_def = reinterpret_cast<const NetDef *>(address);
  MACE_ASSERT1(net_def != NULL, "reinterpret_cast failed.");
  OutputNetDefInfo(net_def);
}

TEST_F(NetDefTest, OutputAllInfo) {
  LOG(INFO) << "NetDefTest start";
  OutputAllInfo(MICRO_MODEL_NAME::kNetDef);
}
} // namespace model
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/activation.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class ActivationOpTest : public ::testing::Test {};
namespace {
// RELU on a 2x2x2x2 tensor: negatives clamp to 0, positives pass through.
void TestSimpleRelu() {
  float input[16] = {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0};
  int32_t input_dims[4] = {2, 2, 2, 2};
  float output[16] = {0};
  int32_t output_dims[4] = {0};
  float expect[16] = {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0};
  int32_t expect_dims[4] = {2, 2, 2, 2};
  const char activation_type[] = "RELU";
  // sizeof includes the NUL terminator; the type is passed as a repeated
  // char argument.
  const uint32_t arg_type_len = sizeof(activation_type);
  ActivationOp activation_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddRepeatArg("activation", activation_type, arg_type_len)
      .AddOutput(output, output_dims, 4);
  activation_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  activation_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
// LEAKYRELU with coefficient 0.1: negatives are scaled by 0.1, positives
// pass through.
void TestSimpleLeakyRelu() {
  float input[16] = {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0};
  int32_t input_dims[4] = {2, 2, 2, 2};
  float output[16] = {0};
  int32_t output_dims[4] = {0};
  float expect[16] =
      {-0.7, 7, -0.6, 6, -0.5, 5, -0.4, 4, -0.3, 3, -0.2, 2, -0.1, 1, 0, 0};
  int32_t expect_dims[4] = {2, 2, 2, 2};
  const char activation_type[] = "LEAKYRELU";
  const uint32_t arg_type_len = sizeof(activation_type);
  ActivationOp activation_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddRepeatArg("activation", activation_type, arg_type_len)
      .AddArg("leakyrelu_coefficient", 0.1f)
      .AddOutput(output, output_dims, 4);
  activation_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  activation_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
// RELU on a 6-element {1, 3, 2, 1} tensor. The name suggests this covers a
// length that is not a multiple of the vectorized block size -- confirm
// against the activation kernel's inner loop.
void TestUnalignedSimpleRelu() {
  float input[6] = {-7, 7, -6, 6, -5, 5};
  int32_t input_dims[4] = {1, 3, 2, 1};
  float output[6] = {0};
  int32_t output_dims[4] = {0};
  float expect[6] = {0, 7, 0, 6, 0, 5};
  int32_t expect_dims[4] = {1, 3, 2, 1};
  const char activation_type[] = "RELU";
  const uint32_t arg_type_len = sizeof(activation_type);
  ActivationOp activation_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddRepeatArg("activation", activation_type, arg_type_len)
      .AddOutput(output, output_dims, 4);
  activation_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  activation_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
// RELUX clamps to [0, max_limit]; with max_limit = 6 the input 7 caps at 6.
void TestSimpleRelux() {
  float input[16] = {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0};
  int32_t input_dims[4] = {2, 2, 2, 2};
  float output[16] = {0};
  int32_t output_dims[4] = {0};
  float expect[16] = {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0};
  int32_t expect_dims[4] = {2, 2, 2, 2};
  const char activation_type[] = "RELUX";
  const uint32_t arg_type_len = sizeof(activation_type);
  ActivationOp activation_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddRepeatArg("activation", activation_type, arg_type_len)
      .AddArg("max_limit", 6)
      .AddOutput(output, output_dims, 4);
  activation_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  activation_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
// PRELU with per-channel alpha {2, 3} (C = 2): each negative is scaled by
// its channel's alpha (e.g. -7 * 2 = -14, -5 * 3 = -15).
void TestSimplePrelu() {
  float input[16] = {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0};
  int32_t input_dims[4] = {2, 2, 2, 2};
  float alpha[2] = {2.0, 3.0};
  int32_t alpha_dims[1] = {2};
  float output[16] = {0};
  int32_t output_dims[4] = {0};
  float expect[16] =
      {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0};
  int32_t expect_dims[4] = {2, 2, 2, 2};
  const char activation_type[] = "PRELU";
  const uint32_t arg_type_len = sizeof(activation_type);
  ActivationOp activation_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddInput(alpha, alpha_dims, 1)  // alpha is a second op input
      .AddRepeatArg("activation", activation_type, arg_type_len)
      .AddOutput(output, output_dims, 4);
  activation_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  activation_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
// TANH: expected values are tanh(x) precomputed for x in [-7, 7].
void TestSimpleTanh() {
  float input[16] = {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0};
  int32_t input_dims[4] = {2, 2, 2, 2};
  float output[16] = {0};
  int32_t output_dims[4] = {0};
  float expect[16] =
      {-0.99999834, 0.99999834, -0.99998771, 0.99998771, -0.9999092, 0.9999092,
       -0.9993293, 0.9993293, -0.99505475, 0.99505475, -0.96402758, 0.96402758,
       -0.76159416, 0.76159416, 0., 0.};
  int32_t expect_dims[4] = {2, 2, 2, 2};
  const char activation_type[] = "TANH";
  const uint32_t arg_type_len = sizeof(activation_type);
  ActivationOp activation_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddRepeatArg("activation", activation_type, arg_type_len)
      .AddOutput(output, output_dims, 4);
  activation_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  activation_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
// SIGMOID: expected values are 1 / (1 + e^-x) precomputed for x in [-7, 7].
void TestSimpleSigmoid() {
  float input[16] = {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0};
  int32_t input_dims[4] = {2, 2, 2, 2};
  float output[16] = {0};
  int32_t output_dims[4] = {0};
  float expect[16] =
      {9.11051194e-04, 9.99088949e-01, 2.47262316e-03, 9.97527377e-01,
       6.69285092e-03, 9.93307149e-01, 1.79862100e-02, 9.82013790e-01,
       4.74258732e-02, 9.52574127e-01, 1.19202922e-01, 8.80797078e-01,
       2.68941421e-01, 7.31058579e-01, 5.00000000e-01, 5.00000000e-01};
  int32_t expect_dims[4] = {2, 2, 2, 2};
  const char activation_type[] = "SIGMOID";
  const uint32_t arg_type_len = sizeof(activation_type);
  ActivationOp activation_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddRepeatArg("activation", activation_type, arg_type_len)
      .AddOutput(output, output_dims, 4);
  activation_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  activation_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
} // namespace
// gtest registrations; each delegates to the matching helper above.
TEST_F(ActivationOpTest, TestSimpleRelu) {
  TestSimpleRelu();
}
TEST_F(ActivationOpTest, TestSimpleLeakyRelu) {
  TestSimpleLeakyRelu();
}
TEST_F(ActivationOpTest, TestUnalignedSimpleRelu) {
  TestUnalignedSimpleRelu();
}
TEST_F(ActivationOpTest, TestSimpleRelux) {
  TestSimpleRelux();
}
TEST_F(ActivationOpTest, TestSimplePrelu) {
  TestSimplePrelu();
}
TEST_F(ActivationOpTest, TestSimpleTanh) {
  TestSimpleTanh();
}
TEST_F(ActivationOpTest, TestSimpleSigmoid) {
  TestSimpleSigmoid();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/argmax.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class ArgMaxOpTest : public ::testing::Test {};
namespace {
// Shared driver: runs ArgMaxOp<float> over `input` with axis -1 (the last
// dimension) and compares the int32 result against `expect`.
// NOTE(review): the axis input is registered with dim size 0 even though
// axis_dims is {1}; presumably the op treats a 0-d input as a scalar axis
// -- confirm against ArgMaxOp.
void ArgMaxTest(
    const float *input, const int32_t *input_dims,
    const int32_t input_dim_size,
    int32_t *output, int32_t *output_dims, const int32_t output_dim_size,
    const int32_t *expect, const int32_t *expect_dims) {
  ArgMaxOp<float> argmax_op;
  int32_t axis[] = {-1};
  int32_t axis_dims[1] = {1};
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, input_dim_size)
      .AddInput(axis, axis_dims, 0)
      .AddOutput(output, output_dims, output_dim_size);
  argmax_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  argmax_op.Run();
  ExpectTensorNear<int32_t>(output, output_dims, output_dim_size,
                            expect, expect_dims, output_dim_size, 1e-5, 1e-3);
}
// Rank-1 input of 3 elements reduces to a scalar (output dim size 0);
// the max (-1) sits at index 1.
void ArgMaxTextVector() {
  const float input[3] = {-3, -1, -2};
  const int32_t input_dims[1] = {3};
  int32_t output[1] = {0};
  int32_t output_dims[1] = {0};
  const int32_t expect[1] = {1};
  const int32_t expect_dims[1] = {0};
  ArgMaxTest(input, input_dims, 1,
             output, output_dims, 0,
             expect, expect_dims);
}
// Rank-2 {3, 3} input reduces over the last axis to a {3} vector of
// per-row argmax indices.
void ArgMaxTextMatrix() {
  const float input[9] = {4, 5, 6, 9, 8, 7, 1, 2, 3};
  const int32_t input_dims[2] = {3, 3};
  int32_t output[3] = {0};
  int32_t output_dims[1] = {0};
  const int32_t expect[3] = {2, 0, 2};
  const int32_t expect_dims[1] = {3};
  // Bug fix: the input is rank-2, but input_dim_size was passed as 1,
  // which truncated the {3, 3} shape to {3} and contradicted the rank-1
  // {3}-shaped expectation (cf. Vector passing 1 and HighRank passing 4).
  ArgMaxTest(input, input_dims, 2,
             output, output_dims, 1,
             expect, expect_dims);
}
// Rank-4 case: argmax over the last axis of a {1, 2, 2, 3} tensor.
// Values increase monotonically, so every inner triple peaks at index 2.
void ArgMaxTextHighRank() {
  const float in_data[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  const int32_t in_dims[4] = {1, 2, 2, 3};
  const int32_t golden[4] = {2, 2, 2, 2};
  const int32_t golden_dims[3] = {1, 2, 2};
  int32_t out_data[4] = {0};
  int32_t out_dims[3] = {0};
  ArgMaxTest(in_data, in_dims, 4,
             out_data, out_dims, 3,
             golden, golden_dims);
}
} // namespace
// Scalar-result argmax over a vector input.
TEST_F(ArgMaxOpTest, Vector) {
  ArgMaxTextVector();
}
// Per-row argmax over a 3x3 matrix.
TEST_F(ArgMaxOpTest, Matrix) {
  ArgMaxTextMatrix();
}
// Argmax along the last axis of a rank-4 tensor.
TEST_F(ArgMaxOpTest, HighRank) {
  ArgMaxTextHighRank();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/bias_add.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
// Empty fixture: groups the BiasAdd op tests; no shared state.
class BiasAddOpTest : public ::testing::Test {};
namespace {
void BiasAddSimple() {
float input[12] = {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
int32_t input_dims[4] = {1, 6, 2, 1};
float bias[1] = {0.5f};
int32_t bias_dims[1] = {1};
float output[12] = {0};
int32_t output_dims[4] = {0};
float expect[12] =
{5.5, 5.5, 7.5, 7.5, 9.5, 9.5, 11.5, 11.5, 13.5, 13.5, 15.5, 15.5};
int32_t expect_dims[4] = {1, 6, 2, 1};
BiasAddOp bias_add_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddOutput(output, output_dims, 4);
bias_add_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
bias_add_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
} // namespace
// Scalar bias broadcast over a rank-4 tensor.
TEST_F(BiasAddOpTest, BiasAddSimple) {
  BiasAddSimple();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/eltwise.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
// Empty fixture: groups the Eltwise op tests; no shared state.
class EltwiseOpTest : public ::testing::Test {};
namespace {
// Runs a single-element eltwise computation: tensor scalar `input_value`
// combined with the "scalar_input" arg `x` under `type`, then checks the
// one-element result against `expect_value`.
template<typename T, typename DstType>
void SimpleScalarScalar(eltwise::Type type, T input_value,
                        float x, const DstType expect_value) {
  T input[1] = {input_value};
  int32_t input_dims[1] = {1};
  // Bug fix: the output and expect buffers must share one element type.
  // The original declared `T output[1]` and compared with
  // ExpectTensorNear<T> against a DstType expect array, which cannot
  // compile for any instantiation with T != DstType.
  DstType output[1] = {0};
  int32_t output_dims[1] = {0};
  DstType expect[1] = {expect_value};
  int32_t expect_dims[1] = {1};
  EltwiseOp<T> eltwise_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 1)
      .AddArg("type", static_cast<int>(type))
      .AddArg("scalar_input", x)
      .AddOutput(output, output_dims, 1);
  eltwise_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  eltwise_op.Run();
  ExpectTensorNear<DstType>(output, output_dims, 1,
                            expect, expect_dims, 1, 1e-5);
}
// Runs tensor-op-scalar elementwise: `input` combined with the
// "scalar_input" arg `x` under `type`; verifies `output` against `expect`.
template<typename T, typename DstType>
void SimpleTensorScalar(eltwise::Type type, const T *input,
                        const int32_t *input_dims, const int32_t input_dim_size,
                        float x, const int32_t output_dim_size,
                        DstType *output, int32_t *output_dims,
                        const DstType *expect, const int32_t *expect_dims) {
  EltwiseOp<T> eltwise_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, input_dim_size)
      .AddArg("type", static_cast<int>(type))
      .AddArg("scalar_input", x)
      .AddOutput(output, output_dims, output_dim_size);
  eltwise_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  eltwise_op.Run();
  // Bug fix: instantiate the comparison with DstType — `output` and
  // `expect` are DstType buffers, so ExpectTensorNear<T> would fail to
  // compile whenever T != DstType.
  ExpectTensorNear<DstType>(output, output_dims, output_dim_size,
                            expect, expect_dims, output_dim_size, 1e-5);
}
// Convenience wrapper around SimpleTensorScalar for a fixed {1, 1, 2, 3}
// input/output shape (6 elements).
template<typename T, typename DstType>
void SimpleTensorScalarForSpecial(eltwise::Type type, const T *input,
                                  float x, const DstType *expect) {
  const int32_t kRank = 4;
  const int32_t shape[kRank] = {1, 1, 2, 3};
  DstType result[6] = {0};
  int32_t result_dims[kRank] = {0};
  SimpleTensorScalar(type, input, shape, kRank, x, kRank,
                     result, result_dims, expect, shape);
}
// SUM with scalar 1 on a single-element {1, 1, 1, 1} tensor: 1 + 1 == 2.
void SimpleTensorScalar1() {
  const int32_t kRank = 4;
  const int32_t in_shape[kRank] = {1, 1, 1, 1};
  const float in_data[] = {1};
  const float golden[1] = {2};
  const int32_t golden_shape[kRank] = {1, 1, 1, 1};
  float result[1] = {0};
  int32_t result_dims[kRank] = {0};
  SimpleTensorScalar(eltwise::SUM, in_data, in_shape, kRank, 1, kRank,
                     result, result_dims, golden, golden_shape);
}
// Runs a two-tensor elementwise computation (`input0` OP `input1`) with an
// optional `coeff` repeat-arg and verifies the result against `expect`.
// `input1` may be omitted (NULL / size 0) for unary-style uses.
template<typename T, typename DstType>
void SimpleTensorEltwise(eltwise::Type type, const T *input0,
                         const int32_t *input0_dims,
                         const int32_t input0_dim_size,
                         const T *input1, const int32_t *input1_dims,
                         const int32_t input1_dim_size,
                         DstType *output, int32_t *output_dims,
                         const int32_t output_dim_size,
                         const DstType *expect, const int32_t *expect_dims,
                         const float *coeff = NULL,
                         const uint32_t coeff_len = 0) {
  EltwiseOp<T> eltwise_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input0, input0_dims, input0_dim_size)
      .AddArg("type", static_cast<int>(type))
      .AddOutput(output, output_dims, output_dim_size);
  if (input1 != NULL && input1_dims != NULL && input1_dim_size > 0) {
    substitude_op.AddInput(input1, input1_dims, input1_dim_size);
  }
  if (coeff != NULL && coeff_len > 0) {
    substitude_op.AddRepeatArg("coeff", coeff, coeff_len);
  }
  eltwise_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  eltwise_op.Run();
  // Bug fix: instantiate the comparison with DstType — `output` and
  // `expect` are DstType buffers, so ExpectTensorNear<T> would fail to
  // compile whenever T != DstType.
  ExpectTensorNear<DstType>(output, output_dims, output_dim_size,
                            expect, expect_dims, output_dim_size, 1e-5);
}
// Rank-4 convenience overload: both inputs, the output, and the
// expectation are all rank-4 tensors.
template<typename T, typename DstType>
void SimpleTensorEltwise(eltwise::Type type, const T *input0,
                         const int32_t *input0_dims, const T *input1,
                         const int32_t *input1_dims, DstType *output,
                         const DstType *expect, const int32_t *expect_dims,
                         const float *coeff = NULL,
                         const uint32_t coeff_len = 0) {
  const int32_t kRank = 4;
  int32_t output_dims[kRank] = {0};
  SimpleTensorEltwise(type, input0, input0_dims, kRank,
                      input1, input1_dims, kRank,
                      output, output_dims, kRank,
                      expect, expect_dims, coeff, coeff_len);
}
} // namespace
// One-element coverage for every supported eltwise type; the second
// operand is supplied through the "scalar_input" arg.
TEST_F(EltwiseOpTest, SimpleScalarScalar) {
  SimpleScalarScalar<float, float>(eltwise::SUM, 1, 2, 3);
  SimpleScalarScalar<float, float>(eltwise::SUB, 1, 2, -1);
  SimpleScalarScalar<float, float>(eltwise::PROD, 1, 2, 2);
  SimpleScalarScalar<float, float>(eltwise::DIV, 1, 2, 0.5);
  // FLOOR_DIV rounds toward negative infinity: 1 / 2 -> 0, 1 / -2 -> -1.
  SimpleScalarScalar<float, float>(eltwise::FLOOR_DIV, 1, 2, 0);
  SimpleScalarScalar<float, float>(eltwise::FLOOR_DIV, 1, -2, -1);
  SimpleScalarScalar<float, float>(eltwise::MIN, 1, 2, 1);
  SimpleScalarScalar<float, float>(eltwise::MAX, 1, 2, 2);
  // For NEG/ABS/SIGN the expectations depend only on the tensor operand;
  // the scalar arg is apparently ignored.
  SimpleScalarScalar<float, float>(eltwise::NEG, 1, 2, -1);
  SimpleScalarScalar<float, float>(eltwise::ABS, -1, 3, 1);
  SimpleScalarScalar<float, float>(eltwise::SIGN, -2, 3, -1);
  SimpleScalarScalar<int32_t, int32_t>(eltwise::EQUAL, 1, 3, 0);
  SimpleScalarScalar<int32_t, int32_t>(eltwise::EQUAL, 3, 3, 1);
}
// Tensor-op-scalar coverage over a fixed {1, 1, 2, 3} shape (see
// SimpleTensorScalarForSpecial); each `expectN` array is the precomputed
// elementwise result.
TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) {
  SimpleTensorScalar1();
  const float input[] = {1, 2, 3, 4, 5, 6};
  const float expect2[] = {0, 1, 2, 3, 4, 5};
  SimpleTensorScalarForSpecial<float, float>(eltwise::SUB, input, 1, expect2);
  const float expect3[] = {2, 4, 6, 8, 10, 12};
  SimpleTensorScalarForSpecial<float, float>(eltwise::PROD, input, 2, expect3);
  const float expect4[] = {1, 1, 1, 1, 1, 1};
  SimpleTensorScalarForSpecial<float, float>(eltwise::MIN, input, 1, expect4);
  const float expect5[] = {3, 3, 3, 4, 5, 6};
  SimpleTensorScalarForSpecial<float, float>(eltwise::MAX, input, 3, expect5);
  const float expect6[] = {-1, -2, -3, -4, -5, -6};
  SimpleTensorScalarForSpecial<float, float>(eltwise::NEG, input, 3, expect6);
  const float expect7[] = {0, 1, 4, 9, 16, 25};
  SimpleTensorScalarForSpecial<float, float>(
      eltwise::SQR_DIFF, input, 1, expect7);
  const int32_t input_i[] = {1, 2, 3, 4, 5, 6};
  const int32_t expect8[] = {0, 0, 1, 0, 0, 0};
  SimpleTensorScalarForSpecial<int32_t, int32_t>(
      eltwise::EQUAL, input_i, 3, expect8);
  const float input9[] = {2, 4, 6, 8, 10, 12};
  const float expect9[] = {1, 2, 3, 4, 5, 6};
  SimpleTensorScalarForSpecial<float, float>(eltwise::DIV, input9, 2, expect9);
  // FLOOR_DIV rounds toward negative infinity for both signs of divisor.
  const float expect10[] = {0, 1, 2, 2, 3, 4};
  SimpleTensorScalarForSpecial<float, float>(
      eltwise::FLOOR_DIV, input9, 3, expect10);
  const float expect11[] = {-1, -2, -2, -3, -4, -4};
  SimpleTensorScalarForSpecial<float, float>(
      eltwise::FLOOR_DIV, input9, -3, expect11);
  const float input12[] = {-1, -2, -3, -4, -5, -6};
  const float expect12[] = {1, 2, 3, 4, 5, 6};
  SimpleTensorScalarForSpecial<float, float>(
      eltwise::ABS, input12, 3, expect12);
  const float input13[] = {1, 2, -3, 0, -5, -6};
  const float expect13[] = {1, 1, -1, 0, -1, -1};
  SimpleTensorScalarForSpecial<float, float>(
      eltwise::SIGN, input13, 3, expect13);
}
// Tensor-vs-vector broadcasting: the first half passes the vector operand
// as a rank-4 tensor ({1, 1, 1, N}); from `expect_14` on, the same cases
// are repeated with the vector as a true rank-1 tensor ({N}) to exercise
// mixed-rank broadcasting.  Array names encode shapes, e.g. dims1215 ==
// {1, 2, 1, 5}.
TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
  const int32_t dims1123[] = {1, 1, 2, 3};
  const int32_t dims1113[] = {1, 1, 1, 3};
  const int32_t dims1215[] = {1, 2, 1, 5};
  const int32_t dims1115[] = {1, 1, 1, 5};
  const int32_t dims1213[] = {1, 2, 1, 3};
  const int32_t dims3[] = {3};
  const int32_t dims5[] = {5};
  // Shared scratch output buffers, sized for the largest result they hold.
  float output6[6] = {0};
  float output10[10] = {0};
  int32_t output6_i[6] = {0};
  int32_t output_dims4[4] = {0};
  const float input0_0[] = {1, 2, 3, 4, 5, 6};
  const float input1_0[] = {1, 2, 3};
  const float expect_0[] = {2, 4, 6, 5, 7, 9};
  SimpleTensorEltwise(eltwise::SUM, input0_0, dims1123, input1_0,
                      dims1113, output6, expect_0, dims1123);
  const float input0_1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  const float input1_1[] = {1, 2, 3, 4, 5};
  const float expect_1[] = {0, 0, 0, 0, 0, 5, 5, 5, 5, 5};
  SimpleTensorEltwise(eltwise::SUB, input0_1, dims1215, input1_1,
                      dims1115, output10, expect_1, dims1215);
  // SUB with swapped operands checks that broadcasting is not commutative.
  const float expect_2[] = {0, 0, 0, 0, 0, -5, -5, -5, -5, -5};
  SimpleTensorEltwise(eltwise::SUB, input1_1, dims1115, input0_1,
                      dims1215, output10, expect_2, dims1215);
  const float expect_3[] = {1, 4, 9, 4, 10, 18};
  SimpleTensorEltwise(eltwise::PROD, input1_0, dims1113, input0_0,
                      dims1213, output6, expect_3, dims1213);
  const float input1_4[] = {1, 1, 1, 1, 5};
  const float expect_4[] = {1, 2, 3, 4, 1, 6, 7, 8, 9, 2};
  SimpleTensorEltwise(eltwise::DIV, input0_1, dims1215, input1_4,
                      dims1115, output10, expect_4, dims1215);
  const float input0_5[] = {1, 1, 1, 2, 4};
  const float input1_5[] = {1, 1, 1, 2, 2, 1, 1, 1, 1, 1};
  const float expect_5[] = {1, 1, 1, 1, 2, 1, 1, 1, 2, 4};
  SimpleTensorEltwise(eltwise::DIV, input0_5, dims1115, input1_5,
                      dims1215, output10, expect_5, dims1215);
  const float input1_6[] = {2, 2, 2, 2, 3};
  const float expect_6[] = {0, 1, 1, 2, 1, 3, 3, 4, 4, 3};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_1, dims1215, input1_6,
                      dims1115, output10, expect_6, dims1215);
  const float input1_7[] = {-2, -2, -2, -2, -3};
  const float expect_7[] = {-1, -1, -2, -2, -2, -3, -4, -4, -5, -4};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_1, dims1215, input1_7,
                      dims1115, output10, expect_7, dims1215);
  const float input1_8[] = {2, 2, 2, 3, 3, 2, 2, 2, 2, 2};
  const float expect_8[] = {0, 0, 0, 0, 1, 0, 0, 0, 1, 2};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_5, dims1115, input1_8,
                      dims1215, output10, expect_8, dims1215);
  const float input1_9[] = {-2, -2, -2, -3, -3, -2, -2, -2, -2, -2};
  const float expect_9[] = {-1, -1, -1, -1, -2, -1, -1, -1, -1, -2};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_5, dims1115, input1_9,
                      dims1215, output10, expect_9, dims1215);
  const float expect_10[] = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5};
  SimpleTensorEltwise(eltwise::MIN, input1_1, dims1115, input0_1,
                      dims1215, output10, expect_10, dims1215);
  const float expect_11[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  SimpleTensorEltwise(eltwise::MAX, input0_1, dims1215, input1_1,
                      dims1115, output10, expect_11, dims1215);
  const float expect_12[] = {0, 0, 0, 0, 0, 25, 25, 25, 25, 25};
  SimpleTensorEltwise(eltwise::SQR_DIFF, input1_1, dims1115, input0_1,
                      dims1215, output10, expect_12, dims1215);
  const int32_t input0_13[] = {1, 2, 3, 4, 5, 6};
  const int32_t input1_13[] = {1, 2, 3};
  const int32_t expect_13[] = {1, 1, 1, 0, 0, 0};
  SimpleTensorEltwise(eltwise::EQUAL, input0_13, dims1123, input1_13,
                      dims1113, output6_i, expect_13, dims1123);
  // From here on the vector operand is passed as a rank-1 tensor; the
  // expected values are identical to the rank-4 cases above.
  const float expect_14[] = {2, 4, 6, 5, 7, 9};
  SimpleTensorEltwise(eltwise::SUM, input0_0, dims1123,
                      4, input1_0, dims3, 1, output6,
                      output_dims4, 4, expect_14, dims1123);
  const float expect_15[] = {0, 0, 0, 0, 0, 5, 5, 5, 5, 5};
  SimpleTensorEltwise(eltwise::SUB, input0_1, dims1215,
                      4, input1_1, dims5, 1, output10,
                      output_dims4, 4, expect_15, dims1215);
  const float expect_16[] = {0, 0, 0, 0, 0, -5, -5, -5, -5, -5};
  SimpleTensorEltwise(eltwise::SUB, input1_1, dims5,
                      1, input0_1, dims1215, 4, output10,
                      output_dims4, 4, expect_16, dims1215);
  const float expect_17[] = {1, 4, 9, 4, 10, 18};
  SimpleTensorEltwise(eltwise::PROD, input1_0, dims3,
                      1, input0_0, dims1213, 4, output6,
                      output_dims4, 4, expect_17, dims1213);
  const float expect_18[] = {1, 2, 3, 4, 1, 6, 7, 8, 9, 2};
  SimpleTensorEltwise(eltwise::DIV, input0_1, dims1215,
                      4, input1_4, dims5, 1, output10,
                      output_dims4, 4, expect_18, dims1215);
  const float expect_19[] = {1, 1, 1, 1, 2, 1, 1, 1, 2, 4};
  SimpleTensorEltwise(eltwise::DIV, input0_5, dims5,
                      1, input1_5, dims1215, 4, output10,
                      output_dims4, 4, expect_19, dims1215);
  const float expect_20[] = {0, 1, 1, 2, 1, 3, 3, 4, 4, 3};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_1, dims1215,
                      4, input1_6, dims5, 1, output10,
                      output_dims4, 4, expect_20, dims1215);
  const float expect_21[] = {-1, -1, -2, -2, -2, -3, -4, -4, -5, -4};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_1, dims1215,
                      4, input1_7, dims5, 1, output10, output_dims4,
                      4, expect_21, dims1215);
  const float expect_22[] = {0, 0, 0, 0, 1, 0, 0, 0, 1, 2};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_5, dims5, 1, input1_8,
                      dims1215, 4, output10, output_dims4,
                      4, expect_22, dims1215);
  const float expect_23[] = {-1, -1, -1, -1, -2, -1, -1, -1, -1, -2};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_5, dims5, 1, input1_9,
                      dims1215, 4, output10, output_dims4,
                      4, expect_23, dims1215);
  const float expect_24[] = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5};
  SimpleTensorEltwise(eltwise::MIN, input1_1, dims5, 1, input0_1,
                      dims1215, 4, output10, output_dims4,
                      4, expect_24, dims1215);
  const float expect_25[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  SimpleTensorEltwise(eltwise::MAX, input0_1, dims1215, 4, input1_1,
                      dims5, 1, output10, output_dims4, 4,
                      expect_25, dims1215);
  const float expect_26[] = {0, 0, 0, 0, 0, 25, 25, 25, 25, 25};
  SimpleTensorEltwise(eltwise::SQR_DIFF, input1_1, dims5, 1, input0_1,
                      dims1215, 4, output10, output_dims4, 4,
                      expect_26, dims1215);
  const int32_t expect_27[] = {1, 1, 1, 0, 0, 0};
  SimpleTensorEltwise(eltwise::EQUAL, input0_13, dims1123, 4, input1_13,
                      dims3, 1, output6_i, output_dims4, 4,
                      expect_27, dims1123);
}
// Tensor-vs-tensor cases with matching shapes, plus SUM with per-operand
// `coeff` weights and a final CLIP case (second input omitted; coeff
// apparently carries the clip bounds — values 1..5 clamp into [2, 3]).
TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) {
  const int32_t dims1123[] = {1, 1, 2, 3};
  const int32_t dims1215[] = {1, 2, 1, 5};
  const int32_t dims1115[] = {1, 1, 1, 5};
  const int32_t dims1213[] = {1, 2, 1, 3};
  float output6[6] = {0};
  float output10[10] = {0};
  int32_t output6_i[6] = {0};
  int32_t output_dims4[4] = {0};
  const float input0_0[] = {1, 2, 3, 4, 5, 6};
  const float expect_0[] = {2, 4, 6, 8, 10, 12};
  SimpleTensorEltwise(eltwise::SUM, input0_0, dims1123, input0_0,
                      dims1123, output6, expect_0, dims1123);
  // Weighted SUM: 0.1 * a + 0.1 * b.
  const float expect_1[] = {0.2, 0.4, 0.6, 0.8, 1, 1.2};
  const float coeff_1[] = {0.1, 0.1};
  SimpleTensorEltwise(eltwise::SUM, input0_0, dims1123, input0_0,
                      dims1123, output6, expect_1, dims1123, coeff_1,
                      sizeof(coeff_1)/ sizeof(float));
  const float input0_2[] = {1, 2, 3, 4, 5};
  const float expect_2[] = {0, 0, 0, 0, 0};
  SimpleTensorEltwise(eltwise::SUB, input0_2, dims1115, input0_2,
                      dims1115, output6, expect_2, dims1115);
  const float expect_3[] = {1, 4, 9, 16, 25, 36};
  SimpleTensorEltwise(eltwise::PROD, input0_0, dims1213, input0_0,
                      dims1213, output6, expect_3, dims1213);
  const float expect_4[] = {1, 1, 1, 1, 1, 1};
  SimpleTensorEltwise(eltwise::DIV, input0_0, dims1213, input0_0,
                      dims1213, output6, expect_4, dims1213);
  const float input0_5[] = {2, 3, 4, 5, 6, 7};
  const float expect_5[] = {2, 1, 1, 1, 1, 1};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_5, dims1213, input0_0,
                      dims1213, output6, expect_5, dims1213);
  const float input0_6[] = {-2, -3, -4, -5, -6, -7};
  const float expect_6[] = {-2, -2, -2, -2, -2, -2};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_6, dims1213, input0_0,
                      dims1213, output6, expect_6, dims1213);
  const float input0_7[] = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5};
  const float input1_7[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  const float expect_7[] = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5};
  SimpleTensorEltwise(eltwise::MIN, input0_7, dims1215, input1_7,
                      dims1215, output10, expect_7, dims1215);
  const float expect_8[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  SimpleTensorEltwise(eltwise::MAX, input1_7, dims1215, input0_7,
                      dims1215, output10, expect_8, dims1215);
  const float expect_9[] = {0, 0, 0, 0, 0, 25, 25, 25, 25, 25};
  SimpleTensorEltwise(eltwise::SQR_DIFF, input0_7, dims1215, input1_7,
                      dims1215, output10, expect_9, dims1215);
  const int input0_10[] = {1, 2, 3, 4, 5, 6};
  const int expect_10[] = {1, 1, 1, 1, 1, 1};
  SimpleTensorEltwise(eltwise::EQUAL, input0_10, dims1123, input0_10,
                      dims1123, output6_i, expect_10, dims1123);
  const float expect_11[] = {2, 2, 3, 3, 3, 2, 2, 3, 3, 3};
  const float coeff_11[] = {2.0f, 3.0f};
  SimpleTensorEltwise<float, float>(
      eltwise::CLIP, input0_7, dims1215,
      4, NULL, NULL, 0, output10, output_dims4, 4, expect_11, dims1215,
      coeff_11, sizeof(coeff_11) / sizeof(float));
}
// General (non-trailing-axis) broadcasting: a {1, 1, 2, 1} operand is
// broadcast across the last axis of a {1, 1, 2, 3} tensor, i.e. element 0
// applies to the first row of 3 and element 1 to the second.
TEST_F(EltwiseOpTest, TensorGeneralBroadcastCPU) {
  const int32_t dims1123[] = {1, 1, 2, 3};
  const int32_t dims1121[] = {1, 1, 2, 1};
  float output[10] = {0};
  const float input0_0[] = {1, 2, 3, 4, 5, 6};
  const float input1_0[] = {1, 2};
  const float expect_0[] = {2, 3, 4, 6, 7, 8};
  SimpleTensorEltwise(eltwise::SUM, input0_0, dims1123, input1_0,
                      dims1121, output, expect_0, dims1123);
  const float expect_1[] = {0, 1, 2, 2, 3, 4};
  SimpleTensorEltwise(eltwise::SUB, input0_0, dims1123, input1_0,
                      dims1121, output, expect_1, dims1123);
  const float expect_2[] = {1, 2, 3, 8, 10, 12};
  SimpleTensorEltwise(eltwise::PROD, input0_0, dims1123, input1_0,
                      dims1121, output, expect_2, dims1123);
  const float expect_3[] = {1, 2, 3, 2, 2.5, 3};
  SimpleTensorEltwise(eltwise::DIV, input0_0, dims1123, input1_0,
                      dims1121, output, expect_3, dims1123);
  const float input1_4[] = {2, 3};
  const float expect_4[] = {0, 1, 1, 1, 1, 2};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_0, dims1123, input1_4,
                      dims1121, output, expect_4, dims1123);
  // Negative divisor: FLOOR_DIV still rounds toward negative infinity.
  const float input1_5[] = {-2, -3};
  const float expect_5[] = {-1, -1, -2, -2, -2, -2};
  SimpleTensorEltwise(eltwise::FLOOR_DIV, input0_0, dims1123, input1_5,
                      dims1121, output, expect_5, dims1123);
  const float expect_6[] = {1, 1, 1, 2, 2, 2};
  SimpleTensorEltwise(eltwise::MIN, input0_0, dims1123, input1_0,
                      dims1121, output, expect_6, dims1123);
  const float expect_7[] = {1, 2, 3, 4, 5, 6};
  SimpleTensorEltwise(eltwise::MAX, input0_0, dims1123, input1_0,
                      dims1121, output, expect_7, dims1123);
  const float expect_8[] = {0, 1, 4, 4, 9, 16};
  SimpleTensorEltwise(eltwise::SQR_DIFF, input0_0, dims1123, input1_0,
                      dims1121, output, expect_8, dims1123);
  const int32_t input0_9[] = {1, 2, 3, 4, 5, 6};
  const int32_t input1_9[] = {1, 2};
  const int32_t expect_9[] = {1, 0, 0, 0, 0, 0};
  int32_t output_9[6] = {0};
  SimpleTensorEltwise(eltwise::EQUAL, input0_9, dims1123, input1_9,
                      dims1121, output_9, expect_9, dims1123);
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/expand_dims.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
// Empty fixture: groups the ExpandDims op tests; no shared state.
class ExpandDimsOpTest : public ::testing::Test {};
namespace {
void ExpandDimsSimpleA() {
MACE_DEFINE_RANDOM_INPUT(float, input, 6);
int32_t input_dims[3] = {3, 2, 1};
float output[6] = {0};
int32_t output_dims[4] = {0};
float *expect = input;
int32_t expect_dims[4] = {3, 1, 2, 1};
ExpandDimsOp expand_dims_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 3)
.AddArg("axis", 1)
.AddOutput(output, output_dims, 4);
expand_dims_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
expand_dims_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
void ExpandDimsSimpleB() {
MACE_DEFINE_RANDOM_INPUT(float, input, 6);
int32_t input_dims[3] = {1, 2, 3};
float output[6] = {0};
int32_t output_dims[4] = {0};
float *expect = input;
int32_t expect_dims[4] = {1, 2, 3, 1};
ExpandDimsOp expand_dims_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 3)
.AddArg("axis", -1)
.AddOutput(output, output_dims, 4);
expand_dims_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
expand_dims_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
} // namespace
// Axis insertion at a positive index and at -1.
TEST_F(ExpandDimsOpTest, ExpandDimsSimple) {
  ExpandDimsSimpleA();
  ExpandDimsSimpleB();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/matmul.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
// Empty fixture: groups the MatMul op tests; no shared state.
class MatMulOpTest : public ::testing::Test {};
namespace {
void Simple(
const float *input0, const int32_t *input0_dims,
const int32_t input0_dim_size,
const float *input1, const int32_t *input1_dims,
const int32_t input1_dim_size,
float *output, int32_t *output_dims, const int32_t output_dim_size,
const float *expect, const int32_t *expect_dims) {
MatMulOp mat_mul_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input0, input0_dims, input0_dim_size)
.AddInput(input1, input1_dims, input1_dim_size)
.AddOutput(output, output_dims, output_dim_size);
mat_mul_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
mat_mul_op.Run();
ExpectTensorNear<float>(output, output_dims, output_dim_size,
expect, expect_dims, output_dim_size, 1e-5);
}
// {1, 2, 3} x {1, 3, 2} -> {1, 2, 2}: a small non-square batch matmul.
void Simple1() {
  const int32_t kRank = 3;
  const float lhs[6] = {1, 2, 3, 4, 5, 6};
  const int32_t lhs_dims[kRank] = {1, 2, 3};
  const float rhs[6] = {1, 2, 3, 4, 5, 6};
  const int32_t rhs_dims[kRank] = {1, 3, 2};
  const float golden[4] = {22, 28, 49, 64};
  const int32_t golden_dims[kRank] = {1, 2, 2};
  float result[6] = {0};
  int32_t result_dims[kRank] = {0};
  Simple(lhs, lhs_dims, kRank,
         rhs, rhs_dims, kRank,
         result, result_dims, kRank,
         golden, golden_dims);
}
// {1, 5, 5} x {1, 5, 5} square matmul of the matrix [1..25] with itself.
void Simple2() {
  const int32_t kRank = 3;
  const float lhs[25] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                         14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25};
  const int32_t lhs_dims[kRank] = {1, 5, 5};
  const float rhs[25] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                         14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25};
  const int32_t rhs_dims[kRank] = {1, 5, 5};
  const float golden[25] = {215, 230, 245, 260, 275, 490, 530, 570, 610,
                            650, 765, 830, 895, 960, 1025, 1040, 1130, 1220,
                            1310, 1400, 1315, 1430, 1545, 1660, 1775};
  const int32_t golden_dims[kRank] = {1, 5, 5};
  float result[25] = {0};
  int32_t result_dims[kRank] = {0};
  Simple(lhs, lhs_dims, kRank,
         rhs, rhs_dims, kRank,
         result, result_dims, kRank,
         golden, golden_dims);
}
} // namespace
// Non-square and square batched matmul.
TEST_F(MatMulOpTest, SimpleCPU) {
  Simple1();
  Simple2();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/nhwc/batch_norm.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
// Empty fixture: groups the BatchNorm op tests; no shared state.
class BatchNormOpTest : public ::testing::Test {};
namespace {
void TestBatchNormOp() {
float input[12] = {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
int32_t input_dims[4] = {1, 6, 2, 1};
float scale[1] = {4.0f};
int32_t scale_dims[1] = {1};
float offset[1] = {2.0f};
int32_t offset_dims[1] = {1};
float mean[1] = {10};
int32_t mean_dims[1] = {1};
float var[1] = {11.67f};
int32_t var_dims[1] = {1};
float output[12] = {0};
int32_t output_dims[4] = {0};
float expect[12] = {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291,
3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543};
int32_t expect_dims[4] = {1, 6, 2, 1};
BatchNormOp batch_norm_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(scale, scale_dims, 1)
.AddInput(offset, offset_dims, 1)
.AddInput(mean, mean_dims, 1)
.AddInput(var, var_dims, 1)
.AddArg("epsilon", 1e-3)
.AddOutput(output, output_dims, 4);
batch_norm_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
batch_norm_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-4);
}
} // namespace
// Single-channel batch normalization over a rank-4 tensor.
TEST_F(BatchNormOpTest, TestBatchNorm) {
  TestBatchNormOp();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/nhwc/conv_2d_c2_s4.h"
#include "micro/ops/nhwc/conv_2d_c3_s4.h"
#include "micro/ops/nhwc/conv_2d_c4_s4.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
// Empty fixture: groups the optimized Conv2d op tests; no shared state.
class Conv2dOptOpTest : public ::testing::Test {};
namespace {
// 3x3 SAME convolution via Conv2dC4S4Op: all-ones {1, 3, 3, 2} input,
// all-ones {4, 3, 3, 2} filter, stride 1.  Each output value is the number
// of in-bounds filter taps times 2 input channels, plus the 0.1 bias
// (corners 8.1, edges 12.1, center 18.1).
void TestNHWCMulti3x3SAME() {
  float input[18] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  int32_t input_dims[4] = {1, 3, 3, 2};
  float filter[72] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  int32_t filter_dims[4] = {4, 3, 3, 2};
  float bias[4] = {0.1f, 0.1f, 0.1f, 0.1f};
  int32_t bias_dims[1] = {4};
  float output[36] = {0};
  int32_t output_dims[4] = {0};
  float expect[36] = {8.1f, 8.1f, 8.1f, 8.1f,
                      12.1f, 12.1f, 12.1f, 12.1f,
                      8.1f, 8.1f, 8.1f, 8.1f,
                      12.1f, 12.1f, 12.1f, 12.1f,
                      18.1f, 18.1f, 18.1f, 18.1f,
                      12.1f, 12.1f, 12.1f, 12.1f,
                      8.1f, 8.1f, 8.1f, 8.1f,
                      12.1f, 12.1f, 12.1f, 12.1f,
                      8.1f, 8.1f, 8.1f, 8.1f};
  int32_t expect_dims[4] = {1, 3, 3, 4};
  const int32_t strides[] = {1, 1};
  const int32_t dilations[] = {1, 1};
  Conv2dC4S4Op conv_2d_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddInput(filter, filter_dims, 4)
      .AddInput(bias, bias_dims, 1)
      .AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
      .AddArg("padding", Padding::SAME)
      .AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
      .AddOutput(output, output_dims, 4);
  conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  conv_2d_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
// 3x3 SAME convolution via Conv2dC2S4Op with unequal strides {1, 2}:
// all-ones {1, 3, 3, 2} input, all-ones {2, 3, 3, 2} filter, producing a
// {1, 3, 2, 2} output (12 elements).
void TestNHWCMulti3x3NeqStride() {
  float input[18] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  int32_t input_dims[4] = {1, 3, 3, 2};
  float filter[36] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  int32_t filter_dims[4] = {2, 3, 3, 2};
  float bias[2] = {0.1f, 0.1f};
  int32_t bias_dims[1] = {2};
  float output[12] = {0};
  int32_t output_dims[4] = {0};
  float expect[12] = {
      8.1f, 8.1f, 8.1f, 8.1f, 12.1f, 12.1f,
      12.1f, 12.1f, 8.1f, 8.1f, 8.1f, 8.1f
  };
  int32_t expect_dims[4] = {1, 3, 2, 2};
  const int32_t strides[] = {1, 2};
  const int32_t dilations[] = {1, 1};
  Conv2dC2S4Op conv_2d_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddInput(filter, filter_dims, 4)
      .AddInput(bias, bias_dims, 1)
      .AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
      .AddArg("padding", Padding::SAME)
      .AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
      .AddOutput(output, output_dims, 4);
  conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  conv_2d_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
// Optimized conv2d (3 output channels), SAME padding, stride (1, 2).
void TestNHWC3Multi3x3NeqStride() {
  float input[18] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  int32_t input_dims[4] = {1, 3, 3, 2};
  // 3x3x3x2 all-ones filter (three output channels).
  float filter[54] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  int32_t filter_dims[4] = {3, 3, 3, 2};
  float bias[3] = {0.1f, 0.1f, 0.1f};
  int32_t bias_dims[1] = {3};
  // BUG FIX: the expected output shape is 1x3x2x3 = 18 elements, but the
  // buffer was declared as output[12], which the kernel would overrun.
  // Sized to match expect_dims.
  float output[18] = {0};
  int32_t output_dims[4] = {0};
  float expect[18] = {8.1f, 8.1f, 8.1f, 8.1f, 8.1f, 8.1f, 12.1f, 12.1f, 12.1f,
                      12.1f, 12.1f, 12.1f, 8.1f, 8.1f, 8.1f, 8.1f, 8.1f, 8.1f};
  int32_t expect_dims[4] = {1, 3, 2, 3};
  const int32_t strides[] = {1, 2};
  const int32_t dilations[] = {1, 1};
  Conv2dC3S4Op conv_2d_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddInput(filter, filter_dims, 4)
      .AddInput(bias, bias_dims, 1)
      .AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
      .AddArg("padding", Padding::SAME)
      .AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
      .AddOutput(output, output_dims, 4);
  conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  conv_2d_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void TestNHWCCombined3x3() {
float input[50] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int32_t input_dims[4] = {1, 5, 5, 2};
float filter[36] =
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
int32_t filter_dims[4] = {2, 3, 3, 2};
float bias[2] = {0.1f, 0.2f};
int32_t bias_dims[1] = {2};
float output[18] = {0};
int32_t output_dims[4] = {0};
float expect[18] = {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f,
9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f};
int32_t expect_dims[4] = {1, 3, 3, 2};
const int32_t strides[] = {2, 2};
const int32_t dilations[] = {1, 1};
Conv2dC2S4Op conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::SAME)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void TestConv1x1() {
float input[150] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int32_t input_dims[4] = {1, 3, 10, 5};
float filter[10] =
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f};
int32_t filter_dims[4] = {2, 1, 1, 5};
float bias[2] = {0.1f, 0.2f};
int32_t bias_dims[1] = {2};
float output[60] = {0};
int32_t output_dims[4] = {0};
float expect[60] = {
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f};
int32_t expect_dims[4] = {1, 3, 10, 2};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
Conv2dC2S4Op conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
} // namespace
// Entry points for the optimized multi-output-channel conv2d kernels.
TEST_F(Conv2dOptOpTest, TestConv2dMultiSAME) {
  TestNHWCMulti3x3SAME();
}
TEST_F(Conv2dOptOpTest, CPUStride2) {
  TestNHWCCombined3x3();
}
TEST_F(Conv2dOptOpTest, CPUConv1x1) {
  TestConv1x1();
}
// NOTE(review): despite the name, this case runs both the 2-channel and
// 3-channel non-equal-stride helpers.
TEST_F(Conv2dOptOpTest, TestNHWC3Multi3x3NeqStride) {
  TestNHWCMulti3x3NeqStride();
  TestNHWC3Multi3x3NeqStride();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/nhwc/conv_2d_ref.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class Conv2dOpTest : public ::testing::Test {};
namespace {
void TestNHWCSimple3x3VALID() {
float input[18] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[18] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
int32_t filter_dims[4] = {1, 3, 3, 2};
float bias[1] = {0.1f};
int32_t bias_dims[1] = {1};
float output[1] = {0};
int32_t output_dims[4] = {0};
float expect[1] = {18.1f};
int32_t expect_dims[4] = {1, 1, 1, 1};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
Conv2dRefOp conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void TestNHWCSimple3x3SAME() {
float input[18] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[18] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
int32_t filter_dims[4] = {1, 3, 3, 2};
float bias[1] = {0.1f};
int32_t bias_dims[1] = {1};
float output[9] = {0};
int32_t output_dims[4] = {0};
float expect[9] = {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f};
int32_t expect_dims[4] = {1, 3, 3, 1};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
Conv2dRefOp conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::SAME)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void TestNHWCSimple3x3NeqStride() {
float input[18] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[18] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
int32_t filter_dims[4] = {1, 3, 3, 2};
float bias[1] = {0.1f};
int32_t bias_dims[1] = {1};
float output[6] = {0};
int32_t output_dims[4] = {0};
float expect[6] = {8.1f, 8.1f, 12.1f, 12.1f, 8.1f, 8.1f};
int32_t expect_dims[4] = {1, 3, 2, 1};
const int32_t strides[] = {1, 2};
const int32_t dilations[] = {1, 1};
Conv2dRefOp conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::SAME)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void TestNHWCSimple3x3WithoutBias() {
float input[18] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[18] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
int32_t filter_dims[4] = {1, 3, 3, 2};
float output[1] = {0};
int32_t output_dims[4] = {0};
float expect[1] = {18.0f};
int32_t expect_dims[4] = {1, 1, 1, 1};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
Conv2dRefOp conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void TestNHWCCombined3x3() {
float input[50] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int32_t input_dims[4] = {1, 5, 5, 2};
float filter[36] =
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
int32_t filter_dims[4] = {2, 3, 3, 2};
float bias[2] = {0.1f, 0.2f};
int32_t bias_dims[1] = {2};
float output[18] = {0};
int32_t output_dims[4] = {0};
float expect[18] = {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f,
9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f};
int32_t expect_dims[4] = {1, 3, 3, 2};
const int32_t strides[] = {2, 2};
const int32_t dilations[] = {1, 1};
Conv2dRefOp conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::SAME)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
// Conv2d with a fused RELU activation, VALID padding. All-(-1) inputs with
// an all-ones filter produce a negative raw sum, so RELU clamps the single
// output pixel to 0 both with and without the bias input.
void TestFusedNHWCSimple3x3VALID(bool need_bias) {
  float input[18] =
      {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
  int32_t input_dims[4] = {1, 3, 3, 2};
  float filter[18] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  int32_t filter_dims[4] = {1, 3, 3, 2};
  float bias[1] = {-0.1f};
  int32_t bias_dims[1] = {1};
  float output[1] = {0};
  int32_t output_dims[4] = {0};
  // Raw conv result is -18 (or -18.1 with bias); RELU yields 0.
  float expect[1] = {0.0f};
  int32_t expect_dims[4] = {1, 1, 1, 1};
  const int32_t strides[] = {1, 1};
  const int32_t dilations[] = {1, 1};
  const char activation[] = "RELU";
  Conv2dRefOp conv_2d_op;
  framework::SubstituteOp substitude_op;
  // NOTE(review): sizeof(activation) is 5 — it counts the trailing '\0' of
  // "RELU". Confirm the framework expects the terminator to be included in
  // the repeat-arg length.
  substitude_op.AddInput(input, input_dims, 4)
      .AddInput(filter, filter_dims, 4)
      .AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
      .AddArg("padding", Padding::VALID)
      .AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
      .AddRepeatArg("activation", activation, sizeof(activation))
      .AddOutput(output, output_dims, 4);
  if (need_bias) {
    // NOTE(review): unlike the other tests, the bias input is registered
    // after AddOutput; presumably inputs and outputs are tracked separately
    // so their relative order does not matter — verify against SubstituteOp.
    substitude_op.AddInput(bias, bias_dims, 1);
  }
  conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  conv_2d_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void TestConv1x1() {
float input[150] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int32_t input_dims[4] = {1, 3, 10, 5};
float filter[10] =
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f};
int32_t filter_dims[4] = {2, 1, 1, 5};
float bias[2] = {0.1f, 0.2f};
int32_t bias_dims[1] = {2};
float output[60] = {0};
int32_t output_dims[4] = {0};
float expect[60] = {
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f};
int32_t expect_dims[4] = {1, 3, 10, 2};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
Conv2dRefOp conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
} // namespace
// Entry points for the reference conv2d kernel.
TEST_F(Conv2dOpTest, TestConv2dVALID) {
  TestNHWCSimple3x3VALID();
}
TEST_F(Conv2dOpTest, TestConv2dSAME) {
  TestNHWCSimple3x3SAME();
}
TEST_F(Conv2dOpTest, NotEqualStrideSimple) {
  TestNHWCSimple3x3NeqStride();
}
TEST_F(Conv2dOpTest, CPUWithoutBias) {
  TestNHWCSimple3x3WithoutBias();
}
TEST_F(Conv2dOpTest, CPUStride2) {
  TestNHWCCombined3x3();
}
// Runs the fused-activation helper both with and without the bias input.
TEST_F(Conv2dOpTest, FusedCPUSimple) {
  TestFusedNHWCSimple3x3VALID(true);
  TestFusedNHWCSimple3x3VALID(false);
}
TEST_F(Conv2dOpTest, CPUConv1x1) {
  TestConv1x1();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/nhwc/depthwise_conv_2d_kb1_s4.h"
#include "micro/ops/nhwc/depthwise_conv_2d_kb2_s4.h"
#include "micro/ops/nhwc/depthwise_conv_2d_kb3_s4.h"
#include "micro/ops/nhwc/depthwise_conv_2d_kb4_s4.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class DepthwiseConv2dOptOpTest : public ::testing::Test {};
namespace {
void SimpleValidTest() {
float input[18] = {1, 2, 2, 4, 3, 6, 4, 8, 5, 10,
6, 12, 7, 14, 8, 16, 9, 18};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[8] = {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f};
int32_t filter_dims[4] = {1, 2, 2, 2};
float bias[2] = {0.1f, 0.2f};
int32_t bias_dims[1] = {2};
float output[8] = {0};
int32_t output_dims[4] = {0};
float expect[8] = {37.1f, 148.2f, 47.1f, 188.2f,
67.1f, 268.2f, 77.1f, 308.2f};
int32_t expect_dims[4] = {1, 2, 2, 2};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
DepthwiseConv2dKB1S4Op depthwise_conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
depthwise_conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
depthwise_conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void MultiKB2ValidTest() {
float input[18] = {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[16] = {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f};
int32_t filter_dims[4] = {2, 2, 2, 2};
float bias[4] = {0.1f, 0.1f, 0.2f, 0.2f};
int32_t bias_dims[1] = {4};
float output[16] = {0};
int32_t output_dims[4] = {0};
float expect[16] = {37.1f, 37.1f, 148.2f, 148.2f,
47.1f, 47.1f, 188.2f, 188.2f,
67.1f, 67.1f, 268.2f, 268.2f,
77.1f, 77.1f, 308.2f, 308.2f};
int32_t expect_dims[4] = {1, 2, 2, 4};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
DepthwiseConv2dKB2S4Op depthwise_conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
depthwise_conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
depthwise_conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void MultiKB3ValidTest() {
float input[18] = {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[24] = {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f};
int32_t filter_dims[4] = {3, 2, 2, 2};
float bias[6] = {0.1f, 0.1f, 0.1f, 0.2f, 0.2f, 0.2f};
int32_t bias_dims[1] = {6};
float output[24] = {0};
int32_t output_dims[4] = {0};
float expect[24] = {37.1f, 37.1f, 37.1f, 148.2f, 148.2f, 148.2f,
47.1f, 47.1f, 47.1f, 188.2f, 188.2f, 188.2f,
67.1f, 67.1f, 67.1f, 268.2f, 268.2f, 268.2f,
77.1f, 77.1f, 77.1f, 308.2f, 308.2f, 308.2f};
int32_t expect_dims[4] = {1, 2, 2, 6};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
DepthwiseConv2dKB3S4Op depthwise_conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
depthwise_conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
depthwise_conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void MultiKB4ValidTest() {
float input[18] = {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[32] = {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f};
int32_t filter_dims[4] = {4, 2, 2, 2};
float bias[8] = {0.1f, 0.1f, 0.1f, 0.1f, 0.2f, 0.2f, 0.2f, 0.2f};
int32_t bias_dims[1] = {8};
float output[32] = {0};
int32_t output_dims[4] = {0};
float expect[32] = {
37.1f, 37.1f, 37.1f, 37.1f, 148.2f, 148.2f, 148.2f, 148.2f,
47.1f, 47.1f, 47.1f, 47.1f, 188.2f, 188.2f, 188.2f, 188.2f,
67.1f, 67.1f, 67.1f, 67.1f, 268.2f, 268.2f, 268.2f, 268.2f,
77.1f, 77.1f, 77.1f, 77.1f, 308.2f, 308.2f, 308.2f, 308.2f};
int32_t expect_dims[4] = {1, 2, 2, 8};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
DepthwiseConv2dKB4S4Op depthwise_conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
depthwise_conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
depthwise_conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void MultiKB5ValidTest() {
float input[18] = {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[40] = {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f};
int32_t filter_dims[4] = {5, 2, 2, 2};
float bias[10] = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.2f, 0.2f, 0.2f, 0.2f, 0.2f};
int32_t bias_dims[1] = {10};
float output[40] = {0};
int32_t output_dims[4] = {0};
float expect[40] = {
37.1f, 37.1f, 37.1f, 37.1f, 37.1f,
148.2f, 148.2f, 148.2f, 148.2f, 148.2f,
47.1f, 47.1f, 47.1f, 47.1f, 47.1f,
188.2f, 188.2f, 188.2f, 188.2f, 188.2f,
67.1f, 67.1f, 67.1f, 67.1f, 67.1f,
268.2f, 268.2f, 268.2f, 268.2f, 268.2f,
77.1f, 77.1f, 77.1f, 77.1f, 77.1f,
308.2f, 308.2f, 308.2f, 308.2f, 308.2f
};
int32_t expect_dims[4] = {1, 2, 2, 10};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
DepthwiseConv2dKB4S4Op depthwise_conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
depthwise_conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
depthwise_conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
} // namespace
// Entry points for the optimized depthwise conv kernels, one per
// channel-block (KB) variant.
TEST_F(DepthwiseConv2dOptOpTest, MultiKB1CPU) {
  SimpleValidTest();
}
TEST_F(DepthwiseConv2dOptOpTest, MultiKB2CPU) {
  MultiKB2ValidTest();
}
TEST_F(DepthwiseConv2dOptOpTest, MultiKB3CPU) {
  MultiKB3ValidTest();
}
TEST_F(DepthwiseConv2dOptOpTest, MultiKB4CPU) {
  MultiKB4ValidTest();
}
TEST_F(DepthwiseConv2dOptOpTest, MultiKB5CPU) {
  MultiKB5ValidTest();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/nhwc/depthwise_conv_2d_ref.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class DepthwiseConv2dOpTest : public ::testing::Test {};
namespace {
void SimpleValidTest() {
float input[18] = {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[8] = {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f};
int32_t filter_dims[4] = {1, 2, 2, 2};
float bias[2] = {0.1f, 0.2f};
int32_t bias_dims[1] = {2};
float output[8] = {0};
int32_t output_dims[4] = {0};
float expect[8] = {37.1f, 148.2f, 47.1f, 188.2f,
67.1f, 268.2f, 77.1f, 308.2f};
int32_t expect_dims[4] = {1, 2, 2, 2};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
DepthwiseConv2dRefOp depthwise_conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
depthwise_conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
depthwise_conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims,
4, expect, expect_dims, 4, 1e-5);
}
void MultiC2ValidTest() {
float input[18] = {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18};
int32_t input_dims[4] = {1, 3, 3, 2};
float filter[16] = {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f,
1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f};
int32_t filter_dims[4] = {2, 2, 2, 2};
float bias[4] = {0.1f, 0.1f, 0.2f, 0.2f};
int32_t bias_dims[1] = {4};
float output[16] = {0};
int32_t output_dims[4] = {0};
float expect[16] = {
37.1f, 37.1f, 148.2f, 148.2f, 47.1f, 47.1f, 188.2f, 188.2f,
67.1f, 67.1f, 268.2f, 268.2f, 77.1f, 77.1f, 308.2f, 308.2f
};
int32_t expect_dims[4] = {1, 2, 2, 4};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {1, 1};
DepthwiseConv2dRefOp depthwise_conv_2d_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddInput(filter, filter_dims, 4)
.AddInput(bias, bias_dims, 1)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
depthwise_conv_2d_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
depthwise_conv_2d_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
} // namespace
// Entry points for the reference depthwise conv2d kernel.
TEST_F(DepthwiseConv2dOpTest, SimpleCPU) {
  SimpleValidTest();
}
// Fixed test-name typo: "MuiltiC2CPU" -> "MultiC2CPU".
TEST_F(DepthwiseConv2dOpTest, MultiC2CPU) {
  MultiC2ValidTest();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/nhwc/pooling_ref.h"
#include "micro/ops/nhwc/pooling_s4.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class PoolingOpTest : public ::testing::Test {};
namespace {
void TestPoolingOpValidMax() {
float input[32] = {
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
int32_t input_dims[4] = {1, 4, 4, 2};
float output[8] = {0};
int32_t output_dims[4] = {0};
float expect[8] = {5, 21, 7, 23, 13, 29, 15, 31};
int32_t expect_dims[4] = {1, 2, 2, 2};
const int32_t strides[] = {2, 2};
const int32_t dilations[] = {1, 1};
const int32_t kernels[] = {2, 2};
PoolingS4Op pooling_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddRepeatArg("kernels", kernels, sizeof(kernels) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddArg("pooling_type", PoolingType::MAX)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
pooling_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
pooling_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
// 2x2 max pooling, stride 2, SAME padding, on a 1x3x3x1 input.
void TestPoolingOpSameMax() {
  // 3x3 single-channel input holding 0..8. The buffer was previously
  // declared as input[32] even though the dims describe only 9 elements;
  // sized to match the tensor shape.
  float input[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
  int32_t input_dims[4] = {1, 3, 3, 1};
  float output[4] = {0};
  int32_t output_dims[4] = {0};
  // Windows anchored at rows/cols {0, 2}; the expected values show the
  // SAME padding falls on the bottom/right edge.
  float expect[4] = {4, 5, 7, 8};
  int32_t expect_dims[4] = {1, 2, 2, 1};
  const int32_t strides[] = {2, 2};
  const int32_t dilations[] = {1, 1};
  const int32_t kernels[] = {2, 2};
  PoolingS4Op pooling_op;
  framework::SubstituteOp substitude_op;
  substitude_op.AddInput(input, input_dims, 4)
      .AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
      .AddRepeatArg("kernels", kernels, sizeof(kernels) / sizeof(int32_t))
      .AddArg("padding", Padding::SAME)
      .AddArg("pooling_type", PoolingType::MAX)
      .AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
      .AddOutput(output, output_dims, 4);
  pooling_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
      &substitude_op), NULL);
  pooling_op.Run();
  ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void TestPoolingOpValidDilation() {
float input[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
int32_t input_dims[4] = {1, 4, 4, 1};
float output[4] = {0};
int32_t output_dims[4] = {0};
float expect[4] = {10, 11, 14, 15};
int32_t expect_dims[4] = {1, 2, 2, 1};
const int32_t strides[] = {1, 1};
const int32_t dilations[] = {2, 2};
const int32_t kernels[] = {2, 2};
PoolingS4Op pooling_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddRepeatArg("kernels", kernels, sizeof(kernels) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddArg("pooling_type", PoolingType::MAX)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
pooling_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
pooling_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void TestPoolingOpValidAvg() {
float input[32] = {
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
int32_t input_dims[4] = {1, 4, 4, 2};
float output[8] = {0};
int32_t output_dims[4] = {0};
float expect[8] = {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5};
int32_t expect_dims[4] = {1, 2, 2, 2};
const int32_t strides[] = {2, 2};
const int32_t dilations[] = {1, 1};
const int32_t kernels[] = {2, 2};
PoolingS4Op pooling_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddRepeatArg("kernels", kernels, sizeof(kernels) / sizeof(int32_t))
.AddArg("padding", Padding::VALID)
.AddArg("pooling_type", PoolingType::AVG)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
pooling_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
pooling_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
void TestPoolingOpSameAvg() {
float input[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
int32_t input_dims[4] = {1, 2, 8, 1};
float output[4] = {0};
int32_t output_dims[4] = {0};
float expect[4] = {4.5, 6.5, 8.5, 10.5};
int32_t expect_dims[4] = {1, 1, 4, 1};
const int32_t strides[] = {2, 2};
const int32_t dilations[] = {1, 1};
const int32_t kernels[] = {2, 2};
PoolingS4Op pooling_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, 4)
.AddRepeatArg("strides", strides, sizeof(strides) / sizeof(int32_t))
.AddRepeatArg("kernels", kernels, sizeof(kernels) / sizeof(int32_t))
.AddArg("padding", Padding::SAME)
.AddArg("pooling_type", PoolingType::AVG)
.AddRepeatArg("dilations", dilations, sizeof(dilations) / sizeof(int32_t))
.AddOutput(output, output_dims, 4);
pooling_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
pooling_op.Run();
ExpectTensorNear<float>(output, output_dims, 4, expect, expect_dims, 4, 1e-5);
}
} // namespace
// Each gtest case below simply delegates to the matching helper above.
TEST_F(PoolingOpTest, TestPoolingValidMax) {
  TestPoolingOpValidMax();
}
TEST_F(PoolingOpTest, TestPoolingSameMax) {
  TestPoolingOpSameMax();
}
TEST_F(PoolingOpTest, TestPoolingValidDilation) {
  TestPoolingOpValidDilation();
}
TEST_F(PoolingOpTest, TestPoolingOpValidAvg) {
  TestPoolingOpValidAvg();
}
TEST_F(PoolingOpTest, TestPoolingOpSameAvg) {
  TestPoolingOpSameAvg();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/reduce.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class ReduceOpTest : public ::testing::Test {};
namespace {
typedef ReduceOpBase::ReduceType ReduceType;
void Simple(
const float *input, const int32_t *input_dims,
const int32_t input_dim_size,
const int32_t *axis, const int32_t axis_size,
float *output, int32_t *output_dims, const int32_t output_dim_size,
const float *expect, const int32_t *expect_dims,
ReduceType type, const bool keepdims = true) {
ReduceOp<float> reduce_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, input_dim_size)
.AddRepeatArg("axis", axis, axis_size)
.AddArg("keepdims", keepdims ? 1 : 0)
.AddArg("reduce_type", static_cast<int32_t>(type))
.AddOutput(output, output_dims, output_dim_size);
reduce_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
reduce_op.Run();
ExpectTensorNear<float>(output, output_dims, output_dim_size,
expect, expect_dims, output_dim_size, 1e-5, 1e-3);
}
// MEAN over axes {1, 2} of a [2, 2, 3, 4] tensor, keepdims=true.
void SimpleMean12Test() {
  const float in_data[48] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  const int32_t in_dim_size = 4;
  const int32_t in_dims[in_dim_size] = {2, 2, 3, 4};
  const int32_t reduce_axis[] = {1, 2};
  const int32_t reduce_axis_size = 2;
  const int32_t out_dim_size = 4;
  const int32_t expected_dims[out_dim_size] = {2, 1, 1, 4};
  // Mean over the 6 reduced elements per output position.
  const float expected[8] = {10, 11, 12, 13, 10, 11, 12, 13};
  int32_t out_dims[out_dim_size] = {0};
  float out[8] = {0};
  Simple(in_data, in_dims, in_dim_size, reduce_axis, reduce_axis_size,
         out, out_dims, out_dim_size,
         expected, expected_dims, ReduceOpBase::MEAN);
}
// MIN over axes {1, 2} of a [2, 2, 3, 4] tensor, keepdims=true.
void SimpleMin12Test() {
  const float in_data[48] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  const int32_t in_dim_size = 4;
  const int32_t in_dims[in_dim_size] = {2, 2, 3, 4};
  const int32_t reduce_axis[] = {1, 2};
  const int32_t reduce_axis_size = 2;
  const int32_t out_dim_size = 4;
  const int32_t expected_dims[out_dim_size] = {2, 1, 1, 4};
  // Minimum of each inner-dim column lies in the first row of each batch.
  const float expected[8] = {0, 1, 2, 3, 0, 1, 2, 3};
  int32_t out_dims[out_dim_size] = {0};
  float out[8] = {0};
  Simple(in_data, in_dims, in_dim_size, reduce_axis, reduce_axis_size,
         out, out_dims, out_dim_size,
         expected, expected_dims, ReduceOpBase::MIN);
}
// MAX over axes {1, 2} of a [2, 2, 3, 4] tensor, keepdims=true.
void SimpleMax12Test() {
  const float in_data[48] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  const int32_t in_dim_size = 4;
  const int32_t in_dims[in_dim_size] = {2, 2, 3, 4};
  const int32_t reduce_axis[] = {1, 2};
  const int32_t reduce_axis_size = 2;
  const int32_t out_dim_size = 4;
  const int32_t expected_dims[out_dim_size] = {2, 1, 1, 4};
  // Maximum of each inner-dim column lies in the last row of each batch.
  const float expected[8] = {20, 21, 22, 23, 20, 21, 22, 23};
  int32_t out_dims[out_dim_size] = {0};
  float out[8] = {0};
  Simple(in_data, in_dims, in_dim_size, reduce_axis, reduce_axis_size,
         out, out_dims, out_dim_size,
         expected, expected_dims, ReduceOpBase::MAX);
}
// MEAN over the single axis 1 of a [2, 2, 3, 4] tensor, keepdims=true.
void SimpleMean1Axis() {
  const float in_data[48] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  const int32_t in_dim_size = 4;
  const int32_t in_dims[in_dim_size] = {2, 2, 3, 4};
  const int32_t reduce_axis[] = {1};
  const int32_t reduce_axis_size = 1;
  const int32_t out_dim_size = 4;
  const int32_t expected_dims[out_dim_size] = {2, 1, 3, 4};
  // Mean of value pairs that differ by 12 -> value + 6.
  const float expected[24] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                              6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
  int32_t out_dims[out_dim_size] = {0};
  float out[24] = {0};
  Simple(in_data, in_dims, in_dim_size, reduce_axis, reduce_axis_size,
         out, out_dims, out_dim_size,
         expected, expected_dims, ReduceOpBase::MEAN);
}
// MIN over the single axis 1 of a [2, 2, 3, 4] tensor, keepdims=true.
void SimpleMin1Axis() {
  const float in_data[48] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  const int32_t in_dim_size = 4;
  const int32_t in_dims[in_dim_size] = {2, 2, 3, 4};
  const int32_t reduce_axis[] = {1};
  const int32_t reduce_axis_size = 1;
  const int32_t out_dim_size = 4;
  const int32_t expected_dims[out_dim_size] = {2, 1, 3, 4};
  // The smaller of each value pair is the one from the first slice.
  const float expected[24] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  int32_t out_dims[out_dim_size] = {0};
  float out[24] = {0};
  Simple(in_data, in_dims, in_dim_size, reduce_axis, reduce_axis_size,
         out, out_dims, out_dim_size,
         expected, expected_dims, ReduceOpBase::MIN);
}
// MAX over the single axis 1 of a [2, 2, 3, 4] tensor, keepdims=true.
void SimpleMax1Axis() {
  const float in_data[48] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                             12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  const int32_t in_dim_size = 4;
  const int32_t in_dims[in_dim_size] = {2, 2, 3, 4};
  const int32_t reduce_axis[] = {1};
  const int32_t reduce_axis_size = 1;
  const int32_t out_dim_size = 4;
  const int32_t expected_dims[out_dim_size] = {2, 1, 3, 4};
  // The larger of each value pair is the one from the second slice.
  const float expected[24] = {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                              12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  int32_t out_dims[out_dim_size] = {0};
  float out[24] = {0};
  Simple(in_data, in_dims, in_dim_size, reduce_axis, reduce_axis_size,
         out, out_dims, out_dim_size,
         expected, expected_dims, ReduceOpBase::MAX);
}
// MEAN over three different pairs of axes, exercising a rank-4 input,
// a rank-3 view of the same data, and a non-adjacent axis pair.
void Simple2Axis() {
  const int32_t input_dim_size = 4;
  const int32_t input_dims[input_dim_size] = {1, 2, 3, 4};
  const float input[24] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                           12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  const int32_t axis_size = 2;
  // Case 1: reduce axes {0, 1} of [1, 2, 3, 4].
  const int32_t axis[axis_size] = {0, 1};
  const int32_t output_dim_size = 4;
  const int32_t expect_dims[output_dim_size] = {1, 1, 3, 4};
  const float expect[12] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
  int32_t output_dims[output_dim_size] = {0};
  float output[12] = {0};
  Simple(input, input_dims, input_dim_size, axis, axis_size,
         output, output_dims, output_dim_size,
         expect, expect_dims, ReduceOpBase::MEAN);
  // Case 2: the same 24 values viewed as [2, 3, 4], reducing axes {1, 2}.
  const int32_t input1_dim_size = 3;
  const int32_t input1_dims[input1_dim_size] = {2, 3, 4};
  const int32_t axis1[axis_size] = {1, 2};
  const int32_t output1_dim_size = 3;
  const int32_t expect1_dims[output1_dim_size] = {2, 1, 1};
  const float expect1[2] = {5.5, 17.5};
  // Fix: size this buffer with output1_dim_size (3); it was previously
  // declared with output_dim_size (4), inconsistent with the rank-3 case.
  int32_t output1_dims[output1_dim_size] = {0};
  float output1[2] = {0};
  Simple(input, input1_dims, input1_dim_size, axis1, axis_size,
         output1, output1_dims, output1_dim_size,
         expect1, expect1_dims, ReduceOpBase::MEAN);
  // Case 3: non-adjacent axes {0, 2} of the rank-4 input.
  const int32_t axis2[axis_size] = {0, 2};
  const int32_t expect2_dims[output_dim_size] = {1, 2, 1, 4};
  const float expect2[8] = {4, 5, 6, 7, 16, 17, 18, 19};
  Simple(input, input_dims, input_dim_size, axis2, axis_size,
         output, output_dims, output_dim_size,
         expect2, expect2_dims, ReduceOpBase::MEAN);
}
// MEAN over all three non-batch axes of a [1, 2, 3, 4] tensor, collapsing
// it to a single value.
void Simple3Axis() {
  const int32_t input_dim_size = 4;
  const int32_t input_dims[input_dim_size] = {1, 2, 3, 4};
  // Fix: the array was declared with 48 slots but the shape (and the
  // initializer list) hold exactly 24 elements.
  const float input[24] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                           12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  const int32_t axis_size = 3;
  const int32_t axis[axis_size] = {1, 2, 3};
  const int32_t output_dim_size = 4;
  const int32_t expect_dims[output_dim_size] = {1, 1, 1, 1};
  // Mean of 0..23 is 11.5.
  const float expect[1] = {11.5};
  int32_t output_dims[output_dim_size] = {0};
  float output[1] = {0};
  Simple(input, input_dims, input_dim_size, axis, axis_size,
         output, output_dims, output_dim_size,
         expect, expect_dims, ReduceOpBase::MEAN);
}
// MEAN over axes {0, 1} with keepdims=false: the reduced axes are dropped,
// so a [2, 3, 4] input produces a rank-1 [4] output.
void CPUSimpleReduceDims() {
  const int32_t input_dim_size = 3;
  const int32_t input_dims[input_dim_size] = {2, 3, 4};
  // Fix: the array was declared with 48 slots but the shape (and the
  // initializer list) hold exactly 24 elements.
  const float input[24] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                           12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  const int32_t axis_size = 2;
  const int32_t axis[axis_size] = {0, 1};
  const int32_t output_dim_size = 1;
  const int32_t expect_dims[output_dim_size] = {4};
  const float expect[4] = {10, 11, 12, 13};
  int32_t output_dims[output_dim_size] = {0};
  float output[4] = {0};
  Simple(input, input_dims, input_dim_size, axis, axis_size,
         output, output_dims, output_dim_size,
         expect, expect_dims, ReduceOpBase::MEAN, false);
}
} // namespace
// Each gtest case below delegates to one or more of the helpers above.
TEST_F(ReduceOpTest, CPUSimple12) {
  SimpleMean12Test();
  SimpleMin12Test();
  SimpleMax12Test();
}
TEST_F(ReduceOpTest, CPUSimple1Axis) {
  SimpleMean1Axis();
  SimpleMin1Axis();
  SimpleMax1Axis();
}
TEST_F(ReduceOpTest, CPUSimple2Axis) {
  Simple2Axis();
}
TEST_F(ReduceOpTest, CPUSimple3Axis) {
  Simple3Axis();
}
TEST_F(ReduceOpTest, CPUSimpleReduceDims) {
  CPUSimpleReduceDims();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/reshape.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class ReshapeOpTest : public ::testing::Test {};
namespace {
template<typename T>
void TestReshapeOp(
const T *input, const int32_t *input_dims, const uint32_t input_dim_size,
const int32_t *shape, const int32_t *shape_dims,
T *y, int32_t *y_dims, const uint32_t y_dim_size,
const T *e, const int32_t *e_dims, const uint32_t e_dim_size) {
ReshapeOp reshape_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, input_dim_size)
.AddInput(shape, shape_dims, 1)
.AddOutput(y, y_dims, y_dim_size);
reshape_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
reshape_op.Run();
ExpectTensorNear<T>(y, y_dims, y_dim_size, e, e_dims, e_dim_size);
}
} // namespace
TEST_F(ReshapeOpTest, TestReshape) {
  // Reshape a random [1, 2, 3] tensor to [3, 2]; the data is unchanged by
  // a reshape, so the input itself serves as the expected output.
  MACE_DEFINE_RANDOM_INPUT(float, x, 6);
  int32_t x_dims[3] = {1, 2, 3};
  int32_t shape[2] = {3, 2};
  int32_t shape_dims[1] = {2};
  float y[6] = {0};
  int32_t y_dims[2] = {0};
  int32_t e_dims[2] = {3, 2};
  TestReshapeOp(x, x_dims, 3, shape, shape_dims,
                y, y_dims, 2, x, e_dims, 2);
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/shape.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class ShapeOpTest : public ::testing::Test {};
namespace {
template<typename EXP_TYPE, typename RES_TYPE>
void TestShapeOp(
const EXP_TYPE *x, const int32_t *x_dims, const uint32_t x_dim_size,
RES_TYPE *y, int32_t *y_dims, const uint32_t y_dim_size,
const RES_TYPE *e, const int32_t *e_dims, const uint32_t e_dim_size) {
ShapeOp shape_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(x, x_dims, x_dim_size)
.AddOutput(y, y_dims, y_dim_size);
shape_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
shape_op.Run();
ExpectTensorNear<int32_t>(y, y_dims, y_dim_size, e, e_dims, e_dim_size);
}
} // namespace
TEST_F(ShapeOpTest, TestShape) {
  // ShapeOp on a random [1, 2, 3] tensor must emit the dims {1, 2, 3}.
  MACE_DEFINE_RANDOM_INPUT(float, x, 6);
  int32_t x_dims[3] = {1, 2, 3};
  int32_t y[3] = {0};
  int32_t y_dims[1] = {0};
  int32_t e[3] = {1, 2, 3};
  int32_t e_dims[1] = {3};
  TestShapeOp(x, x_dims, 3, y, y_dims, 1, e, e_dims, 1);
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/softmax.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class SoftmaxOpTest : public ::testing::Test {};
namespace {
void Simple(bool use_log = false) {
const float input[8] = {1, 1, 1, 1, 1, 2, 3, 4};
const int32_t input_dim_size = 4;
const int32_t input_dims[input_dim_size] = {1, 1, 2, 4};
float output[8] = {0};
const int32_t output_dim_size = 4;
int32_t output_dims[output_dim_size] = {0};
const int32_t expect_dims[output_dim_size] = {1, 1, 2, 4};
float expected_data1[8] = {-1.3862944, -1.3862944, -1.3862944, -1.3862944,
-3.4401896, -2.4401896, -1.4401897, -0.44018975};
float expected_data2[8] = {0.25, 0.25, 0.25, 0.25,
0.0320586, 0.08714432, 0.23688282, 0.6439142};
float *expect = use_log ? expected_data1 : expected_data2;
SoftmaxOp softmax_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, input_dim_size)
.AddArg("use_log", static_cast<int>(use_log))
.AddOutput(output, output_dims, output_dim_size);
softmax_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
softmax_op.Run();
ExpectTensorNear<float>(output, output_dims, output_dim_size,
expect, expect_dims, output_dim_size, 1e-5);
}
} // namespace
// Exercise both the softmax and the log-softmax paths of the helper above.
TEST_F(SoftmaxOpTest, CPUSimple) { Simple(); }
TEST_F(SoftmaxOpTest, CPUSimpleUseLog) { Simple(true); }
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/squeeze.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class SqueezeOpTest : public ::testing::Test {};
namespace {
void TestSqueeze(
const float *input, const int32_t *input_dims,
const int32_t input_dim_size,
const int32_t *axis,
const int32_t axis_size,
float *output, int32_t *output_dims, const int32_t output_dim_size,
const float *expect, const int32_t *expect_dims) {
SqueezeOp squeeze_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, input_dim_size)
.AddOutput(output, output_dims, output_dim_size);
if (axis != NULL && axis_size > 0) {
substitude_op.AddRepeatArg("axis", axis, axis_size);
}
squeeze_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
squeeze_op.Run();
ExpectTensorNear<float>(output, output_dims, output_dim_size,
expect, expect_dims, output_dim_size, 1e-5);
}
// Drives the parameterized TestSqueeze helper through the supported axis
// configurations. Squeeze removes only size-1 dimensions, so data never
// changes and the input doubles as the expected output.
void TestSqueeze() {
  MACE_DEFINE_RANDOM_INPUT(float, input, 8);
  const int32_t dims1214[] = {1, 2, 1, 4};
  const int32_t dims24[] = {2, 4};
  const int32_t dims124[] = {1, 2, 4};
  const int32_t dims1411[] = {1, 4, 1, 1};
  const int32_t dims141[] = {1, 4, 1};
  float output[8] = {0};
  int32_t output_dims[10] = {0};
  // No axis: every size-1 dimension is squeezed, [1,2,1,4] -> [2,4].
  TestSqueeze(input, dims1214, 4, NULL, 0,
              output, output_dims, 2, input, dims24);
  // axis = {1}: dimension 1 has size 2, so nothing is squeezed.
  int32_t axis_size = 1;
  int32_t axis[] = {1};
  TestSqueeze(input, dims1214, 4, axis, axis_size,
              output, output_dims, 4, input, dims1214);
  // axis = {2}: only the size-1 dimension 2 goes, [1,2,1,4] -> [1,2,4].
  int32_t axis2[] = {2};
  TestSqueeze(input, dims1214, 4, axis2, axis_size,
              output, output_dims, 3, input, dims124);
  // axis = {1, 2}: [1,4,1,1] -> [1,4,1]. Bug fix: this case previously
  // passed `input` even though `input3` (sized 4 to match dims1411) was
  // defined for it and then left unused.
  MACE_DEFINE_RANDOM_INPUT(float, input3, 4);
  int32_t axis3[2] = {1, 2};
  TestSqueeze(input3, dims1411, 4, axis3, 2,
              output, output_dims, 3, input3, dims141);
}
} // namespace
// Delegates to the no-argument TestSqueeze driver above.
TEST_F(SqueezeOpTest, TestSqueeze) {
  TestSqueeze();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/stack.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class StackOpTest : public ::testing::Test {};
namespace {
void TestStack(
const float **inputs, const int32_t inputs_size, const int32_t *input_dims,
const int32_t input_dim_size, int axis,
float *output, int32_t *output_dims, const int32_t output_dim_size,
const float *expect, const int32_t *expect_dims) {
StackOp<float> stack_op;
framework::SubstituteOp substitude_op;
substitude_op.AddArg("axis", axis)
.AddOutput(output, output_dims, output_dim_size);
for (int32_t i = 0; i < inputs_size; ++i) {
substitude_op.AddInput(inputs[i], input_dims, input_dim_size);
}
stack_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
stack_op.Run();
ExpectTensorNear<float>(output, output_dims, output_dim_size,
expect, expect_dims, output_dim_size, 1e-5);
}
// Stacking three scalars along axis 0 yields a length-3 vector.
void TestStackScalar() {
  const float in0[1] = {1};
  const float in1[1] = {2};
  const float in2[1] = {3};
  const float *inputs[] = {in0, in1, in2};
  const int32_t axis = 0;
  const int32_t output_dim_size = 1;
  float output[3] = {0};
  int32_t output_dims[output_dim_size] = {0};
  const float expected[3] = {1, 2, 3};
  const int32_t expected_dims[output_dim_size] = {3};
  TestStack(inputs, 3, NULL, 0, axis,
            output, output_dims, output_dim_size, expected, expected_dims);
}
// Stacking three length-2 vectors along axis 0, -2 (same as 0 for the
// rank-2 result), and -1 (interleaves the elements).
void TestStackVector() {
  const float in0[] = {1, 4};
  const float in1[] = {2, 5};
  const float in2[] = {3, 6};
  const float *inputs[] = {in0, in1, in2};
  const int32_t input_dim_size = 1;
  const int32_t input_dims[input_dim_size] = {2};
  const int32_t output_dim_size = 2;
  float output[6] = {0};
  int32_t output_dims[output_dim_size] = {0};
  const float expected[6] = {1, 4, 2, 5, 3, 6};
  const int32_t expected_dims[output_dim_size] = {3, 2};
  int32_t axis = 0;
  TestStack(inputs, 3, input_dims, input_dim_size, axis,
            output, output_dims, output_dim_size, expected, expected_dims);
  // axis -2 on a rank-2 result is equivalent to axis 0.
  axis = -2;
  TestStack(inputs, 3, input_dims, input_dim_size, axis,
            output, output_dims, output_dim_size, expected, expected_dims);
  // axis -1 stacks element-wise: shape becomes [2, 3].
  axis = -1;
  const float expected2[6] = {1, 2, 3, 4, 5, 6};
  const int32_t expected_dims2[output_dim_size] = {2, 3};
  TestStack(inputs, 3, input_dims, input_dim_size, axis,
            output, output_dims, output_dim_size, expected2, expected_dims2);
}
// Stacking two [2, 3] matrices along each possible axis of the [2, 2, 3]
// result (-3 is equivalent to 0).
void TestStackHighRank() {
  const float in0[] = {1, 2, 3, 4, 5, 6};
  const float in1[] = {7, 8, 9, 10, 11, 12};
  const float *inputs[] = {in0, in1};
  const int32_t input_dim_size = 2;
  const int32_t input_dims[input_dim_size] = {2, 3};
  const int32_t output_dim_size = 3;
  float output[12] = {0};
  int32_t output_dims[output_dim_size] = {0};
  // axis -3 (== 0): the two matrices are simply concatenated.
  int32_t axis = -3;
  const float expected[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
  const int32_t expected_dims[output_dim_size] = {2, 2, 3};
  TestStack(inputs, 2, input_dims, input_dim_size, axis,
            output, output_dims, output_dim_size, expected, expected_dims);
  // axis 1: rows of the two matrices alternate.
  axis = 1;
  const float expected1[12] = {1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12};
  TestStack(inputs, 2, input_dims, input_dim_size, axis,
            output, output_dims, output_dim_size, expected1, expected_dims);
  // axis 2: elements of the two matrices interleave; shape is [2, 3, 2].
  axis = 2;
  const int32_t expected_dims2[output_dim_size] = {2, 3, 2};
  const float expected2[12] = {1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12};
  TestStack(inputs, 2, input_dims, input_dim_size, axis,
            output, output_dims, output_dim_size, expected2, expected_dims2);
}
} // namespace
// Each gtest case below delegates to the matching helper above.
TEST_F(StackOpTest, TestStackScalar) {
  TestStackScalar();
}
TEST_F(StackOpTest, TestStackVector) {
  TestStackVector();
}
TEST_F(StackOpTest, TestStackHighRank) {
  TestStackHighRank();
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "micro/ops/gtest_utils.h"
#include "micro/ops/strided_slice.h"
#include "micro/ops/substitute_op.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
class StridedSliceOpTest : public ::testing::Test {};
namespace {
void TestStridedSlice(
const float *input, const int32_t *input_dims, const int32_t input_dim_size,
const int32_t *begin_indices, const int32_t *end_indices,
const int32_t *strides,
const int32_t *indices_dims, const int32_t indices_dim_size,
const int32_t begin_mask, const int32_t end_mask,
const int32_t ellipsis_mask, const int32_t new_axis_mask,
const int32_t shrink_axis_mask, const int32_t output_dim_size,
float *output, int32_t *output_dims,
const float *expect, const int32_t *expect_dims) {
StridedSliceOp<float> strided_slice_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, input_dim_size)
.AddInput(begin_indices, indices_dims, indices_dim_size)
.AddInput(end_indices, indices_dims, indices_dim_size)
.AddInput(strides, indices_dims, indices_dim_size)
.AddArg("begin_mask", begin_mask)
.AddArg("end_mask", end_mask)
.AddArg("ellipsis_mask", ellipsis_mask)
.AddArg("new_axis_mask", new_axis_mask)
.AddArg("shrink_axis_mask", shrink_axis_mask)
.AddOutput(output, output_dims, output_dim_size);
strided_slice_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
strided_slice_op.Run();
ExpectTensorNear<float>(output, output_dims, output_dim_size,
expect, expect_dims, output_dim_size);
}
void TestSlice(
const float *input, const int32_t *input_dims, const int32_t input_dim_size,
const int32_t *begin_indices, const int32_t *indice_sizes,
const int32_t *indices_dims, const int32_t indices_dim_size,
float *output, int32_t *output_dims, const int32_t output_dim_size,
const float *expect, const int32_t *expect_dims) {
StridedSliceOp<float> strided_slice_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, input_dim_size)
.AddInput(begin_indices, indices_dims, indices_dim_size)
.AddInput(indice_sizes, indices_dims, indices_dim_size)
.AddArg("slice", 1)
.AddOutput(output, output_dims, output_dim_size);
strided_slice_op.Init(NULL, reinterpret_cast<framework::OpContext *>(
&substitude_op), NULL);
strided_slice_op.Run();
ExpectTensorNear<float>(output, output_dims, output_dim_size,
expect, expect_dims, output_dim_size);
}
void TestStridedSliceByFirstAxis() {
  // Input is a [2, 3, 2] tensor holding 1..12.
  const float input[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
  const int32_t begin_indices[] = {1, 0, 0};
  const int32_t end_indices[] = {2, 3, 2};
  const int32_t strides[] = {1, 1, 1};
  const int32_t indices_dim_size = 1;
  const int32_t indices_dims[indices_dim_size] = {3};
  const int32_t input_dim_size = 3;
  const int32_t input_dims[input_dim_size] = {2, 3, 2};
  float output[6] = {0};
  const int32_t output_dim_size = 3;
  int32_t output_dims[output_dim_size] = {0};
  // Slicing [1:2, 0:3, 0:2] keeps the second outer slice: values 7..12.
  const float expect[6] = {7, 8, 9, 10, 11, 12};
  const int32_t expect_dims[output_dim_size] = {1, 3, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect, expect_dims);
  // shrink_axis_mask = 1 drops the sliced first axis: output rank is 2.
  const int32_t output_dim_size1 = 2;
  int32_t output_dims1[output_dim_size1] = {0};
  const int32_t expect_dims1[output_dim_size1] = {3, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 1, output_dim_size1,
                   output, output_dims1, expect, expect_dims1);
  // begin_mask = end_mask = 6 (binary 110) makes axes 1 and 2 ignore the
  // begin/end values below, so the result equals the first slice again.
  const int32_t begin_indices2[] = {1, 1, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices2, end_indices, strides,
                   indices_dims, indices_dim_size,
                   6, 6, 0, 0, 0, output_dim_size,
                   output, output_dims, expect, expect_dims);
}
// Exercises StridedSliceOp on a rank-1 input {1, 2, 3, 4}: positive and
// negative begin indices, negative strides, begin/end masks, and
// shrink_axis_mask producing a scalar.
void TestStridedSliceRank1() {
  const float input[] = {1, 2, 3, 4};
  const int32_t begin_indices[] = {1};
  const int32_t end_indices[] = {3};
  const int32_t strides[] = {1};
  const int32_t indices_dim_size = 1;
  const int32_t indices_dims[indices_dim_size] = {1};
  const int32_t input_dim_size = 1;
  const int32_t input_dims[input_dim_size] = {4};
  float output[4] = {0};
  const int32_t output_dim_size = 1;
  int32_t output_dims[output_dim_size] = {0};
  // input[1:3:1] -> {2, 3}.
  const float expect[2] = {2, 3};
  const int32_t expect_dims[output_dim_size] = {2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect, expect_dims);
  // Negative begin: input[-3:3:1] is the same slice.
  const int32_t begin_indices1[] = {-3};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices1, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect, expect_dims);
  // Negative stride: input[-2:-4:-1] -> {3, 2}.
  const int32_t begin_indices2[] = {-2};
  const int32_t end_indices2[] = {-4};
  const int32_t strides2[] = {-1};
  const float expect2[2] = {3, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices2, end_indices2, strides2,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect2, expect_dims);
  // Stride -2 from the last element: input[-1:-4:-2] -> {4, 2}.
  // (An exact copy-paste duplicate of this case, using begin_indices4 /
  // strides4 / expect4 with identical values, has been removed.)
  const int32_t begin_indices3[] = {-1};
  const int32_t strides3[] = {-2};
  const float expect3[2] = {4, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices3, end_indices2, strides3,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect3, expect_dims);
  // begin_mask = 1 ignores the begin index for axis 0 -> {4, 3, 2}.
  const float expect5[3] = {4, 3, 2};
  const int32_t expect_dims5[output_dim_size] = {3};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices2, end_indices2, strides2,
                   indices_dims, indices_dim_size,
                   1, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect5, expect_dims5);
  // end_mask = 1 ignores the end index for axis 0 -> {3, 2, 1}.
  const float expect6[3] = {3, 2, 1};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices2, end_indices2, strides2,
                   indices_dims, indices_dim_size,
                   0, 1, 0, 0, 0, output_dim_size,
                   output, output_dims, expect6, expect_dims5);
  // Both masks set: the full input reversed -> {4, 3, 2, 1}.
  const float expect7[4] = {4, 3, 2, 1};
  const int32_t expect_dims7[output_dim_size] = {4};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices2, end_indices2, strides2,
                   indices_dims, indices_dim_size,
                   1, 1, 0, 0, 0, output_dim_size,
                   output, output_dims, expect7, expect_dims7);
  // Positive stride 2 with both masks set -> every other element {1, 3}.
  const int32_t begin_indices8[] = {2};
  const int32_t end_indices8[] = {4};
  const int32_t strides8[] = {2};
  const float expect8[2] = {1, 3};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices8, end_indices8, strides8,
                   indices_dims, indices_dim_size,
                   1, 1, 0, 0, 0, output_dim_size,
                   output, output_dims, expect8, expect_dims);
  // shrink_axis_mask = 1 collapses the axis: input[2] -> scalar 3.
  const int32_t output_dim_size9 = 0;
  int32_t output_dims9[] = {1};
  const float expect9[] = {3};
  const int32_t *expect_dims9 = NULL;
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices8, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 1, output_dim_size9,
                   output, output_dims9, expect9, expect_dims9);
}
// Exercises StridedSlice on a rank-2 input. The five integer arguments to
// TestStridedSlice are presumably begin/end/ellipsis/new_axis/shrink_axis
// masks — TODO confirm against TestStridedSlice's declaration.
void TestStridedSliceRank2() {
  // Input tensor (2x3):
  //   [[1, 2, 3],
  //    [4, 5, 6]]
  const float input[] = {1, 2, 3, 4, 5, 6};
  const int32_t begin_indices[] = {0, 0};
  const int32_t end_indices[] = {2, 3};
  const int32_t strides[] = {1, 1};
  const int32_t indices_dim_size = 1;
  const int32_t indices_dims[indices_dim_size] = {2};
  const int32_t input_dim_size = 2;
  const int32_t input_dims[input_dim_size] = {2, 3};
  float output[6] = {0};
  const int32_t output_dim_size = 2;
  int32_t output_dims[output_dim_size] = {0};
  // Case 0: identity slice [0:2, 0:3] with stride 1 copies the full tensor.
  const float expect[6] = {1, 2, 3, 4, 5, 6};
  const int32_t expect_dims[output_dim_size] = {2, 3};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect, expect_dims);
  // Case 1: indices given only for the first axis; remaining axis is taken
  // in full, so the result is still the whole tensor.
  const int32_t begin_indices1[] = {0};
  const int32_t end_indices1[] = {2};
  const int32_t strides1[] = {1};
  const int32_t indices_dims1[indices_dim_size] = {1};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices1, end_indices1, strides1,
                   indices_dims1, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect, expect_dims);
  // Case 2: [1:2, 1:3] -> bottom-right sub-row {5, 6}.
  const int32_t begin_indices2[] = {1, 1};
  const float expect2[2] = {5, 6};
  const int32_t expect_dims2[output_dim_size] = {1, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices2, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect2, expect_dims2);
  // Case 3: stride 2 along the second axis keeps columns 0 and 2.
  const int32_t strides3[] = {1, 2};
  const float expect3[4] = {1, 3, 4, 6};
  const int32_t expect_dims3[output_dim_size] = {2, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides3,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect3, expect_dims3);
  // Case 4: negative strides walk backwards from [1, 2] (exclusive end 0).
  const int32_t begin_indices4[] = {1, 2};
  const int32_t end_indices4[] = {0, 0};
  const int32_t strides4[] = {-1, -1};
  const float expect4[2] = {6, 5};
  const int32_t expect_dims4[output_dim_size] = {1, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices4, end_indices4, strides4,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect4, expect_dims4);
  // Case 5: masks 3/3 make both axes ignore begin/end, so the negative
  // strides reverse the entire tensor.
  const float expect5[6] = {6, 5, 4, 3, 2, 1};
  const int32_t expect_dims5[output_dim_size] = {2, 3};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices4, end_indices4, strides4,
                   indices_dims, indices_dim_size,
                   3, 3, 0, 0, 0, output_dim_size,
                   output, output_dims, expect5, expect_dims5);
  // Case 6: shrink mask 1 collapses the first axis; output rank drops to 1.
  const int32_t begin_indices6[] = {1, 0};
  const int32_t end_indices6[] = {2, 3};
  const int32_t strides6[] = {1, 1};
  const float expect6[3] = {4, 5, 6};
  const int32_t output_dim_size6 = 1;
  const int32_t expect_dims6[output_dim_size6] = {3};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices6, end_indices6, strides6,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 1, output_dim_size6,
                   output, output_dims, expect6, expect_dims6);
  // Case 7: shrink mask 3 collapses both axes; the result is the scalar 6.
  const int32_t begin_indices7[] = {1, 2};
  const float expect7[1] = {6};
  const int32_t output_dim_size7 = 0;
  const int32_t *expect_dims7 = NULL;
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices7, end_indices6, strides6,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 3, output_dim_size7,
                   output, output_dims, expect7, expect_dims7);
}
// Exercises StridedSlice on rank-3 inputs; no masks are used here, only
// plain begin/end/stride slicing.
void TestStridedSliceRank3() {
  // Input tensor of shape {2, 3, 2}: values 1..12 in row-major order.
  const float input[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
  const int32_t begin_indices[] = {0, 0, 0};
  const int32_t end_indices[] = {2, 3, 2};
  const int32_t strides[] = {1, 2, 1};
  const int32_t indices_dim_size = 1;
  const int32_t indices_dims[indices_dim_size] = {3};
  const int32_t input_dim_size = 3;
  const int32_t input_dims[input_dim_size] = {2, 3, 2};
  float output[8] = {0};
  const int32_t output_dim_size = 3;
  int32_t output_dims[output_dim_size] = {0};
  // Case 0: stride 2 on the middle axis keeps rows 0 and 2 of each slab.
  const float expect[8] = {1, 2, 5, 6, 7, 8, 11, 12};
  const int32_t expect_dims[output_dim_size] = {2, 2, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect, expect_dims);
  // Case 1: a different input of shape {3, 2, 3}; slice [1:2, 0:1, 0:3]
  // extracts the first row of the middle slab.
  const float input1[] = {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6};
  const int32_t begin_indices1[] = {1, 0, 0};
  const int32_t end_indices1[] = {2, 1, 3};
  const int32_t strides1[] = {1, 1, 1};
  const int32_t input_dims1[input_dim_size] = {3, 2, 3};
  const float expect1[3] = {3, 3, 3};
  const int32_t expect_dims1[output_dim_size] = {1, 1, 3};
  TestStridedSlice(input1, input_dims1, input_dim_size,
                   begin_indices1, end_indices1, strides1,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect1, expect_dims1);
  // Case 2: same input, stride 2 on the middle axis plus truncated ends.
  const int32_t begin_indices2[] = {0, 0, 0};
  const int32_t end_indices2[] = {2, 2, 2};
  const int32_t strides2[] = {1, 2, 1};
  const float expect2[4] = {1, 1, 3, 3};
  const int32_t expect_dims2[output_dim_size] = {2, 1, 2};
  TestStridedSlice(input1, input_dims1, input_dim_size,
                   begin_indices2, end_indices2, strides2,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect2, expect_dims2);
}
// Exercises StridedSlice on a rank-4 input, including begin/end masks,
// shrink masks, and negative strides.
void TestStridedSliceRank4() {
  // Input tensor of shape {2, 2, 2, 3}: values 0..23 in row-major order.
  const float input[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                         14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
  const int32_t begin_indices[] = {1, 0, 1, 0};
  const int32_t end_indices[] = {2, 2, 2, 2};
  const int32_t strides[] = {1, 1, 1, 1};
  const int32_t indices_dim_size = 1;
  const int32_t indices_dims[indices_dim_size] = {4};
  const int32_t input_dim_size = 4;
  const int32_t input_dims[input_dim_size] = {2, 2, 2, 3};
  float output[8] = {0};
  const int32_t output_dim_size = 4;
  int32_t output_dims[output_dim_size] = {0};
  // Case 0: plain slice [1:2, 0:2, 1:2, 0:2].
  const float expect[8] = {15, 16, 21, 22};
  const int32_t expect_dims[output_dim_size] = {1, 2, 1, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect, expect_dims);
  // Case 1: begin mask 3 (bits 0 and 1) resets begin of the first two axes
  // to 0, so both outer slabs contribute.
  const float expect1[8] = {3, 4, 9, 10, 15, 16, 21, 22};
  const int32_t expect_dims1[output_dim_size] = {2, 2, 1, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides,
                   indices_dims, indices_dim_size,
                   3, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect1, expect_dims1);
  // Case 2: end mask 8 (bit 3) extends the last axis to its full extent.
  const float expect2[8] = {15, 16, 17, 21, 22, 23};
  const int32_t expect_dims2[output_dim_size] = {1, 2, 1, 3};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 8, 0, 0, 0, output_dim_size,
                   output, output_dims, expect2, expect_dims2);
  // Case 3: additionally shrink the last axis (mask 8); output rank is 3.
  const float expect3[8] = {15, 21};
  const int32_t output_dim_size3 = 3;
  const int32_t expect_dims3[output_dim_size3] = {1, 2, 1};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 8, 0, 0, 8, output_dim_size3,
                   output, output_dims, expect3, expect_dims3);
  // Case 4: shrink mask 15 collapses every axis; the result is scalar 15.
  const float expect4[8] = {15};
  const int32_t output_dim_size4 = 0;
  const int32_t *expect_dims4 = NULL;
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices, end_indices, strides,
                   indices_dims, indices_dim_size,
                   0, 8, 0, 0, 15, output_dim_size4,
                   output, output_dims, expect4, expect_dims4);
  // Case 5: negative begin index (-1 = last slab) and all-negative strides
  // walk backwards along every axis.
  const int32_t begin_indices5[] = {-1, 2, 1, 3};
  const int32_t end_indices5[] = {0, 0, 0, 0};
  const int32_t strides5[] = {-1, -1, -1, -1};
  const float expect5[2] = {23, 22};
  const int32_t expect_dims5[output_dim_size] = {1, 1, 1, 2};
  TestStridedSlice(input, input_dims, input_dim_size,
                   begin_indices5, end_indices5, strides5,
                   indices_dims, indices_dim_size,
                   0, 0, 0, 0, 0, output_dim_size,
                   output, output_dims, expect5, expect_dims5);
}
// Exercises the Slice op (begin + size semantics, unlike StridedSlice's
// begin/end/stride).
void TestSlice() {
  // Input tensor (2x3):
  //   [[1, 2, 3],
  //    [4, 5, 6]]
  const float input[] = {1, 2, 3, 4, 5, 6};
  const int32_t begin_indices[] = {0, 0};
  const int32_t indice_sizes[] = {2, 3};
  const int32_t indices_dim_size = 1;
  const int32_t indices_dims[indices_dim_size] = {2};
  const int32_t input_dim_size = 2;
  const int32_t input_dims[input_dim_size] = {2, 3};
  float output[6] = {0};
  const int32_t output_dim_size = 2;
  int32_t output_dims[output_dim_size] = {0};
  // Case 0: full-size slice copies the whole tensor.
  const float expect[6] = {1, 2, 3, 4, 5, 6};
  const int32_t expect_dims[output_dim_size] = {2, 3};
  TestSlice(input, input_dims, input_dim_size,
            begin_indices, indice_sizes,
            indices_dims, indices_dim_size,
            output, output_dims, output_dim_size,
            expect, expect_dims);
  // Case 1: begin {1, 0} with size {1, 2} -> first two entries of row 1.
  const int32_t begin_indices1[] = {1, 0};
  const int32_t indice_sizes1[] = {1, 2};
  const float expect1[2] = {4, 5};
  const int32_t expect_dims1[output_dim_size] = {1, 2};
  TestSlice(input, input_dims, input_dim_size,
            begin_indices1, indice_sizes1,
            indices_dims, indices_dim_size,
            output, output_dims, output_dim_size,
            expect1, expect_dims1);
  // Case 2: size -1 takes the remainder of that dimension (columns 1..2).
  const int32_t begin_indices2[] = {0, 1};
  const int32_t indice_sizes2[] = {2, -1};
  const float expect2[4] = {2, 3, 5, 6};
  const int32_t expect_dims2[output_dim_size] = {2, 2};
  TestSlice(input, input_dims, input_dim_size,
            begin_indices2, indice_sizes2,
            indices_dims, indices_dim_size,
            output, output_dims, output_dim_size,
            expect2, expect_dims2);
}
} // namespace
// gtest entry points: each test wraps one of the helper routines defined in
// the anonymous namespace above.
TEST_F(StridedSliceOpTest, TestStridedSliceByFirstAxis) {
  TestStridedSliceByFirstAxis();
}
// Formatting fix: the Rank1 wrapper previously crammed the call and closing
// brace on one line, unlike every sibling wrapper.
TEST_F(StridedSliceOpTest, TestStridedSliceRank1) {
  TestStridedSliceRank1();
}
TEST_F(StridedSliceOpTest, TestStridedSliceRank2) {
  TestStridedSliceRank2();
}
TEST_F(StridedSliceOpTest, TestStridedSliceRank3) {
  TestStridedSliceRank3();
}
TEST_F(StridedSliceOpTest, TestStridedSliceRank4) {
  TestStridedSliceRank4();
}
TEST_F(StridedSliceOpTest, TestSlice) {
  TestSlice();
}
} // namespace test
} // namespace ops
} // namespace micro
# Bazel BUILD file for MACE Micro test utilities and Hexagon RPC glue.
package(
    default_visibility = ["//visibility:public"],
)

licenses(["notice"])  # Apache 2.0

# Wrapper script around the Hexagon SDK QAIC IDL compiler.
filegroup(
    name = "qaic",
    srcs = ["rpc/qaic.sh"],
    visibility = ["//visibility:public"],
)

# CPU (ARM)-side FastRPC stub sources, built against the Hexagon SDK's
# ARM headers.
cc_library(
    name = "rpc_stub",
    srcs = glob([
        "rpc/stub/*.cc",
    ]),
    hdrs = glob([
        "rpc/stub/*.h",
    ]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    strip_include_prefix = "",
    deps = [
        "@hexagon_sdk//:headers_arm",
    ],
    alwayslink = 1,
)

# DSP-side FastRPC skeleton sources, built against the Hexagon SDK's
# DSP headers.
cc_library(
    name = "rpc_skel",
    srcs = glob([
        "rpc/skel/*.cc",
    ]),
    hdrs = glob([
        "rpc/skel/*.h",
    ]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    strip_include_prefix = "",
    deps = [
        "@hexagon_sdk//:headers_dsp",
    ],
    alwayslink = 1,
)

# Test utilities WITHOUT gtest; gtest_utils.h is excluded so this target can
# be linked where the gtest framework is unavailable — presumably for
# benchmark/on-device builds; confirm against the targets that consume it.
cc_library(
    name = "ccutils",
    srcs = glob([
        "micro/common/*.cc",
        "micro/ops/*.cc",
    ]),
    hdrs = glob(
        [
            "micro/common/*.h",
            "micro/ops/*.h",
        ],
        exclude = ["micro/ops/gtest_utils.h"],
    ),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    strip_include_prefix = "",
    deps = [
        "//micro/base",
        "//micro/framework:framework_for_optest",
        "//micro/include",
    ],
)

# Same utility sources as :ccutils but with gtest_utils.h included and a
# dependency on gtest, for unit-test targets.
cc_library(
    name = "ccutils_with_gtest",
    srcs = glob([
        "micro/common/*.cc",
        "micro/ops/*.cc",
    ]),
    hdrs = glob([
        "micro/common/*.h",
        "micro/ops/*.h",
    ]),
    copts = [
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    strip_include_prefix = "",
    deps = [
        "//micro/base",
        "//micro/framework:framework_for_optest",
        "//micro/include",
        "@gtest",
    ],
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/common/global_buffer.h"
#include "micro/base/logging.h"
#include "micro/include/public/micro.h"
namespace micro {
namespace common {
namespace test {
namespace {
// Capacity chosen for N=1, H=128, W=128, C=4 with three float tensors
// (INPUT1, INPUT2, OUTPUT): 128 * 128 * 4 elements * 3 tensors * 4 bytes.
const uint32_t kGlobalBufferSize = 128 * 128 * 4 * 3 * 4;
// Statically allocated backing store handed out by GlobalBuffer.
uint8_t kGlobalBuffer[kGlobalBufferSize];
// Process-wide singleton returned by GetGlobalBuffer().
GlobalBuffer global_buffer;
}
GlobalBuffer::GlobalBuffer() : offset_(0) {}
GlobalBuffer::~GlobalBuffer() {}
// Discards all previous allocations by rewinding the bump pointer to the
// start of the static buffer. Previously returned pointers become invalid.
void GlobalBuffer::reset() {
  offset_ = 0;
}
// Hands out `size` bytes from the static global buffer using bump-pointer
// allocation; aborts via LOG(FATAL) when the buffer would overflow.
void *GlobalBuffer::DoGetBuffer(uint32_t size) {
  // Round the requested size up to a 4-byte boundary so every returned
  // pointer stays 4-byte aligned.
  if (size % 4 != 0) {
    size = (size + 3) / 4 * 4;
  }
  if (offset_ + size > kGlobalBufferSize) {
    // Bug fix: the original message had no separator between "enough." and
    // "offset_:", producing "not enough.offset_: ...".
    LOG(FATAL) << "Global buffer is not enough. "
               << "offset_: " << offset_ << ", size: " << size
               << ", kGlobalBufferSize: " << kGlobalBufferSize;
  }
  void *ptr = kGlobalBuffer + offset_;
  offset_ += size;
  return ptr;
}
// Returns the process-wide GlobalBuffer singleton defined above.
GlobalBuffer *GetGlobalBuffer() {
  return &global_buffer;
}
} // namespace test
} // namespace common
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_TEST_CCUTILS_MICRO_COMMON_GLOBAL_BUFFER_H_
#define MICRO_TEST_CCUTILS_MICRO_COMMON_GLOBAL_BUFFER_H_
#include "micro/base/logging.h"
#include "micro/include/public/micro.h"
namespace micro {
namespace common {
namespace test {
// Bump-pointer allocator over a fixed static buffer, used by op tests to
// obtain scratch memory without heap allocation. Individual allocations
// cannot be freed; only reset() releases everything at once.
class GlobalBuffer {
 public:
  GlobalBuffer();
  ~GlobalBuffer();
  // Rewinds the bump pointer, invalidating every previously returned buffer.
  void reset();
  // Returns a buffer for `size` elements of T; `size` must be positive.
  template<typename T>
  T *GetBuffer(int32_t size) {
    MACE_ASSERT(size > 0);
    return static_cast<T *>(
        DoGetBuffer(static_cast<uint32_t>(size) * sizeof(T)));
  }
  // Unsigned overload of GetBuffer; no positivity check is performed.
  template<typename T>
  T *GetBuffer(uint32_t size) {
    return static_cast<T *>(DoGetBuffer(size * sizeof(T)));
  }

 private:
  // Hands out `size` bytes (implementation rounds up to a 4-byte multiple)
  // and fails fatally when the static buffer is exhausted.
  void *DoGetBuffer(uint32_t size);

 private:
  uint32_t offset_;  // current bump-pointer offset into the static buffer
};
// Returns the process-wide GlobalBuffer singleton.
GlobalBuffer *GetGlobalBuffer();
} // namespace test
} // namespace common
} // namespace micro
#endif // MICRO_TEST_CCUTILS_MICRO_COMMON_GLOBAL_BUFFER_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_TEST_CCUTILS_MICRO_OPS_GTEST_UTILS_H_
#define MICRO_TEST_CCUTILS_MICRO_OPS_GTEST_UTILS_H_
#include "gtest/gtest.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/include/public/micro.h"
#include "micro/include/utils/macros.h"
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
// Exact-equality assertion dispatch: integral and other types use EXPECT_EQ,
// while float/double use gtest's ULP-based comparisons.
template<typename T>
inline void ExpectEqual(const T &a, const T &b) {
  EXPECT_EQ(a, b);
}
template<>
inline void ExpectEqual<float>(const float &a, const float &b) {
  EXPECT_FLOAT_EQ(a, b);
}
template<>
inline void ExpectEqual<double>(const double &a, const double &b) {
  EXPECT_DOUBLE_EQ(a, b);
}
// Expector compares an expected tensor against a result tensor; the is_fp
// flag selects tolerance-based comparison (floating point, this
// specialization) versus exact comparison (the one below).
template<typename EXP_TYPE,
         typename RES_TYPE,
         bool is_fp = true>
struct Expector;
// Partial specialization for float and double.
template<typename EXP_TYPE, typename RES_TYPE>
struct Expector<EXP_TYPE, RES_TYPE, true> {
  static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
  // Element-wise exact comparison after asserting identical shapes.
  static void Equal(
      const EXP_TYPE *x, const int32_t *x_dims, const uint32_t x_dim_size,
      const RES_TYPE *y, const int32_t *y_dims, const uint32_t y_dim_size) {
    AssertSameDims(x_dims, x_dim_size, y_dims, y_dim_size);
    const int32_t size = base::GetShapeSize(x_dim_size, x_dims);
    for (int32_t i = 0; i < size; ++i) {
      ExpectEqual(x[i], y[i]);
    }
  }
  // Element-wise near-comparison with tolerance abs_err + rel_err * |x[i]|.
  // For rank-4 tensors the failure message includes the [n, h, w, c] index.
  static void Near(
      const EXP_TYPE *x, const int32_t *x_dims, const uint32_t x_dim_size,
      const RES_TYPE *y, const int32_t *y_dims, const uint32_t y_dim_size,
      const double rel_err, const double abs_err) {
    AssertSameDims(x_dims, x_dim_size, y_dims, y_dim_size);
    if (x_dim_size == 4) {
      // NOTE: x and y are advanced in place through the whole tensor.
      for (int32_t n = 0; n < x_dims[0]; ++n) {
        for (int32_t h = 0; h < x_dims[1]; ++h) {
          for (int32_t w = 0; w < x_dims[2]; ++w) {
            for (int32_t c = 0; c < x_dims[3]; ++c) {
              const double error = abs_err + rel_err * base::abs(*x);
              EXPECT_NEAR(*x, *y, error) << "with index = [" << n << ", " << h
                                         << ", " << w << ", " << c << "]";
              x++;
              y++;
            }
          }
        }
      }
    } else {
      const int32_t size = base::GetShapeSize(x_dim_size, x_dims);
      for (int32_t i = 0; i < size; ++i) {
        const double error = abs_err + rel_err * base::abs(x[i]);
        EXPECT_NEAR(x[i], y[i], error);
      }
    }
  }
};
// Non-floating-point specialization: Near degenerates to exact equality and
// ignores the tolerances.
template<typename EXP_TYPE, typename RES_TYPE>
struct Expector<EXP_TYPE, RES_TYPE, false> {
  static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
  // Element-wise exact comparison after asserting identical shapes.
  static void Equal(
      const EXP_TYPE *x, const int32_t *x_dims, const uint32_t x_dim_size,
      const RES_TYPE *y, const int32_t *y_dims, const uint32_t y_dim_size) {
    AssertSameDims(x_dims, x_dim_size, y_dims, y_dim_size);
    const int32_t size = base::GetShapeSize(x_dim_size, x_dims);
    for (int32_t i = 0; i < size; ++i) {
      ExpectEqual(x[i], y[i]);
    }
  }
  // Tolerances are meaningless for exact types; forward to Equal.
  static void Near(
      const EXP_TYPE *x, const int32_t *x_dims, const uint32_t x_dim_size,
      const RES_TYPE *y, const int32_t *y_dims, const uint32_t y_dim_size,
      const double rel_err, const double abs_err) {
    MACE_UNUSED(rel_err);
    MACE_UNUSED(abs_err);
    Equal(x, x_dims, x_dim_size, y, y_dims, y_dim_size);
  }
};
// Asserts x ~= y element-wise with tolerance abs_err + rel_err * |x[i]|.
// Heterogeneous-type overload; is_fp defaults to true, so this uses the
// floating-point Expector.
template<typename EXP_TYPE, typename RES_TYPE>
void ExpectTensorNear(
    const EXP_TYPE *x, const int32_t *x_dims, const uint32_t x_dim_size,
    const RES_TYPE *y, const int32_t *y_dims, const uint32_t y_dim_size,
    const double rel_err = 1e-5, const double abs_err = 1e-8) {
  Expector<EXP_TYPE, RES_TYPE>::Near(x, x_dims, x_dim_size, y,
                                     y_dims, y_dim_size, rel_err, abs_err);
}
// Same-type convenience overload (more specialized, so it wins overload
// resolution when both tensors share one element type).
template<typename T>
void ExpectTensorNear(
    const T *x, const int32_t *x_dims, const uint32_t x_dim_size,
    const T *y, const int32_t *y_dims, const uint32_t y_dim_size,
    const double rel_err = 1e-5, const double abs_err = 1e-8) {
  Expector<T, T>::Near(x, x_dims, x_dim_size, y,
                       y_dims, y_dim_size, rel_err, abs_err);
}
// Asserts that x and y point in (nearly) the same direction: the dot
// product must be within rel_err * |dot| of the product of the norms,
// i.e. cosine similarity ~= 1. Magnitude differences beyond that are
// therefore NOT detected by this check.
template<typename EXP_TYPE, typename RES_TYPE>
void ExpectTensorSimilar(
    const EXP_TYPE *x, const int32_t *x_dims, const uint32_t x_dim_size,
    const RES_TYPE *y, const int32_t *y_dims, const uint32_t y_dim_size,
    const double rel_err = 1e-5) {
  AssertSameDims(x_dims, x_dim_size, y_dims, y_dim_size);
  const int32_t size = base::GetShapeSize(x_dim_size, x_dims);
  double dot_product = 0.0, x_norm = 0.0, y_norm = 0.0;
  for (int32_t i = 0; i < size; i++) {
    dot_product += x[i] * y[i];
    x_norm += x[i] * x[i];
    y_norm += y[i] * y[i];
  }
  double norm_product = base::sqrt(x_norm) * base::sqrt(y_norm);
  double error = rel_err * base::abs(dot_product);
  EXPECT_NEAR(dot_product, norm_product, error);
  // NOTE(review): this logs the shape unconditionally, even on success —
  // looks like debug output that was left in; confirm before removing.
  PrintDims(x_dims, x_dim_size);
}
} // namespace test
} // namespace ops
} // namespace micro
#endif // MICRO_TEST_CCUTILS_MICRO_OPS_GTEST_UTILS_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/framework/operator.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/include/utils/macros.h"
#include "micro/ops/substitute_op.h"
namespace micro {
namespace framework {
Operator::~Operator() {}
// In op tests the OpContext pointer actually holds a SubstituteOp (set up by
// Operator::Init below); this macro performs that reinterpretation.
#ifndef fake_op_
#define fake_op_ (reinterpret_cast<SubstituteOp *>(op_context_))
#endif  // fake_op_
// Scratch memory shared by all operators under test.
const uint32_t kScratchBufferSize = 100000;
uint8_t kScratchBuffer[kScratchBufferSize] = {0};
// Minimal engine config used in place of a real engine: everything except
// the scratch buffer is left NULL.
MaceMicroEngineConfig kTmpMicroEngineConfig = {
    NULL,  // net_def_;
    NULL,  // model_data_;
    NULL,  // graph_;
    NULL,  // op_array_;
    NULL,  // tensor_mem_;
    NULL,  // input_buffers_;
    NULL,  // input_shapes_;
    kScratchBuffer,
    kScratchBufferSize,
};
// Test-only Init: deliberately ignores the caller-supplied engine_config and
// op_def, wiring the operator to the shared test config instead. op_context
// is expected to point at a SubstituteOp (see the fake_op_ macro above).
MaceStatus Operator::Init(MaceMicroEngineConfig *engine_config,
                          framework::OpContext *op_context,
                          const model::OperatorDef *op_def) {
  engine_config_ = &kTmpMicroEngineConfig;
  op_context_ = op_context;
  MACE_UNUSED(engine_config);
  MACE_UNUSED(op_def_);
  MACE_UNUSED(op_def);
  return OnInit();
}
// Per-op initialization hook; the default does nothing.
MaceStatus Operator::OnInit() {
  return MACE_SUCCESS;
}
// Base Run must be overridden by concrete ops; reaching it is an error.
MaceStatus Operator::Run() {
  MACE_NOT_IMPLEMENTED;
  return MACE_SUCCESS;
}
// The substitute-op test framework resolves arguments through the typed
// GetArgByName<T> specializations generated below; this raw accessor must
// never be reached, so it asserts unconditionally.
const model::Argument *Operator::GetArgByName(const char *name) const {
  MACE_UNUSED(name);
  // Typo fix: "Thsi" -> "This" in the assertion message.
  MACE_ASSERT1(false, "This method should not be invoked.");
  return NULL;
}
// The accessors below simply forward to the SubstituteOp backing this
// operator in tests (see the fake_op_ macro).
uint32_t Operator::GetInputSize() {
  return fake_op_->GetInputSize();
}
const void *Operator::DoGetInputData(uint32_t idx) {
  return fake_op_->DoGetInputData(idx);
}
uint32_t Operator::GetInputShapeDimSize(uint32_t idx) {
  return fake_op_->GetInputShapeDimSize(idx);
}
const int32_t *Operator::GetInputShapeDims(uint32_t idx) {
  return fake_op_->GetInputShapeDims(idx);
}
uint32_t Operator::GetOutputSize() {
  return fake_op_->GetOutputSize();
}
void *Operator::DoGetOutputData(uint32_t idx) {
  return fake_op_->DoGetOutputData(idx);
}
uint32_t Operator::GetOutputShapeDimSize(uint32_t idx) {
  return fake_op_->GetOutputShapeDimSize(idx);
}
const int32_t *Operator::GetOutputShapeDims(uint32_t idx) {
  return fake_op_->GetOutputShapeDims(idx);
}
MaceStatus Operator::ResizeOutputShape(uint32_t idx, uint32_t dim_size,
                                       const int32_t *dims) {
  return fake_op_->ResizeOutputShape(idx, dim_size, dims);
}
// Generates the typed scalar-argument accessors (bool/int32_t/float) as
// template specializations forwarding to SubstituteOp::GetArgByName<T>.
// The FUNC parameter is unused in this test implementation — presumably it
// mirrors the proto field name used by the real implementation; confirm
// against the non-test Operator sources.
#ifndef MACE_DEFINE_GET_ARG_BY_NAME_FUNC
#define MACE_DEFINE_GET_ARG_BY_NAME_FUNC(T, FUNC)                      \
  template <>                                                          \
  T Operator::GetArgByName(const char *name, T default_value) const {  \
    return fake_op_->GetArgByName<T>(name, default_value);             \
  }
#endif  // MACE_DEFINE_GET_ARG_BY_NAME_FUNC
MACE_DEFINE_GET_ARG_BY_NAME_FUNC(bool, i)
MACE_DEFINE_GET_ARG_BY_NAME_FUNC(int32_t, i)
MACE_DEFINE_GET_ARG_BY_NAME_FUNC(float, f)
// Same pattern for repeated (array) arguments.
#ifndef MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC
#define MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC(T, FUNC)                \
  template <>                                                          \
  const T *Operator::GetRepeatArgByName(const char *name,              \
                                        uint32_t *size) const {        \
    return fake_op_->GetRepeatArgByName<T>(name, size);                \
  }
#endif  // MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC
MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC(int32_t, ints)
MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC(float, floats)
MACE_DEFINE_GET_ARRAY_ARG_BY_NAME_FUNC(uint8_t, s)
} // namespace framework
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/include/public/micro.h"
#include "micro/include/utils/macros.h"
#include "micro/ops/substitute_op.h"
namespace micro {
namespace framework {
SubstituteOp::SubstituteOp()
    : input_idx_(0), output_idx_(0), arg_idx_(0), repeat_arg_idx_(0) {}
// Registers an input tensor (unowned data pointer + shape) and returns
// *this so registrations can be chained.
SubstituteOp &SubstituteOp::AddInput(
    const void *input, const int32_t *dims, const uint32_t dims_size) {
  // NOTE(review): this `||` chain accepts e.g. input == NULL as long as
  // dims is non-NULL; it looks like `(input != NULL && dims != NULL) ||
  // dims_size == 0` may have been intended — confirm before changing.
  MACE_ASSERT1(input != NULL || dims != NULL || dims_size == 0,
               "Invalid param");
  MACE_ASSERT1(input_idx_ < kMaxInputNum, "Not enough mem.");
  inputs_[input_idx_] = input;
  input_dims_[input_idx_] = dims;
  input_dim_sizes_[input_idx_] = dims_size;
  ++input_idx_;
  return *this;
}
// Registers an output tensor (unowned, writable data + shape buffer).
SubstituteOp &SubstituteOp::AddOutput(
    void *output, int32_t *dims, const uint32_t dims_size) {
  // NOTE(review): same `||` condition concern as in AddInput above.
  MACE_ASSERT1(output != NULL || dims != NULL || dims_size == 0,
               "Invalid param");
  MACE_ASSERT1(output_idx_ < kMaxOutputNum, "Not enough mem.");
  outputs_[output_idx_] = output;
  output_dims_[output_idx_] = dims;
  output_dim_sizes_[output_idx_] = dims_size;
  ++output_idx_;
  return *this;
}
// Accessors over the registered inputs/outputs. The counts are simply the
// number of Add{Input,Output} calls made so far; each idx is bounds-checked.
uint32_t SubstituteOp::GetInputSize() {
  return input_idx_;
}
const void *SubstituteOp::DoGetInputData(uint32_t idx) {
  MACE_ASSERT1(idx < input_idx_, "idx is not valid");
  return inputs_[idx];
}
uint32_t SubstituteOp::GetInputShapeDimSize(uint32_t idx) {
  MACE_ASSERT1(idx < input_idx_, "idx is not valid");
  return input_dim_sizes_[idx];
}
const int32_t *SubstituteOp::GetInputShapeDims(uint32_t idx) {
  MACE_ASSERT1(idx < input_idx_, "idx is not valid");
  return input_dims_[idx];
}
uint32_t SubstituteOp::GetOutputSize() {
  return output_idx_;
}
void *SubstituteOp::DoGetOutputData(uint32_t idx) {
  MACE_ASSERT1(idx < output_idx_, "idx is not valid");
  return outputs_[idx];
}
uint32_t SubstituteOp::GetOutputShapeDimSize(uint32_t idx) {
  MACE_ASSERT1(idx < output_idx_, "idx is not valid");
  return output_dim_sizes_[idx];
}
const int32_t *SubstituteOp::GetOutputShapeDims(uint32_t idx) {
  MACE_ASSERT1(idx < output_idx_, "idx is not valid");
  return output_dims_[idx];
}
// Copies the op's computed output shape into the caller-provided dims
// buffer. The buffer registered via AddOutput must be large enough; shrinking
// the rank is allowed, growing it is not.
MaceStatus SubstituteOp::ResizeOutputShape(uint32_t idx,
                                           uint32_t input_dim_size,
                                           const int32_t *input_dims) {
  MACE_ASSERT1(idx < output_idx_, "idx is not valid");
  MACE_ASSERT1(input_dim_size <= output_dim_sizes_[idx],
               "Can not support dynamic dim size");
  if (output_dims_[idx] != NULL && input_dim_size > 0) {
    base::memcpy(output_dims_[idx], input_dims,
                 sizeof(int32_t) * input_dim_size);
  }
  output_dim_sizes_[idx] = input_dim_size;
  return MACE_SUCCESS;
}
// Buffer reuse is a no-op in tests: inputs and outputs are caller-owned
// arrays, so there is nothing to alias.
MaceStatus SubstituteOp::ReuseInputBufferForOutput(uint32_t output_idx,
                                                   uint32_t input_idx) {
  MACE_UNUSED(output_idx);
  MACE_UNUSED(input_idx);
  return MACE_SUCCESS;
}
} // namespace framework
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_TEST_CCUTILS_MICRO_OPS_SUBSTITUTE_OP_H_
#define MICRO_TEST_CCUTILS_MICRO_OPS_SUBSTITUTE_OP_H_
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/include/public/micro.h"
namespace micro {
namespace framework {
// Capacity limits for the fixed-size arrays inside SubstituteOp.
const uint32_t kMaxInputNum = 10;
const uint32_t kMaxOutputNum = 4;
const uint32_t kMaxArgNum = 20;
// A scalar op argument; every value is stored as float and cast back to the
// requested type on read (see SubstituteOp::GetArgByName).
struct Arg {
  const char *name;
  float value;
};
// A repeated (array) op argument; `ptr` is unowned and must outlive the op.
struct RepeatArg {
  const char *name;
  const void *ptr;
  uint32_t length;
};
// Test stand-in for a real operator context: holds caller-owned input/output
// buffers, shapes, and arguments in fixed-size arrays so op implementations
// can be exercised without building a full model graph.
class SubstituteOp {
 public:
  SubstituteOp();
  ~SubstituteOp() {}
  // Registers an input tensor; chainable.
  SubstituteOp &AddInput(const void *input,
                         const int32_t *dims, const uint32_t dims_size);
  // Registers an output tensor; chainable.
  SubstituteOp &AddOutput(void *output,
                          int32_t *dims, const uint32_t dims_size);
  // Registers a scalar argument; the value is stored as float regardless of T.
  template<typename T>
  SubstituteOp &AddArg(const char *name, T value) {
    MACE_ASSERT(arg_idx_ < kMaxArgNum);
    args_[arg_idx_].name = name;
    args_[arg_idx_].value = static_cast<float>(value);
    ++arg_idx_;
    return *this;
  }
  // Registers a repeated argument; `value` is unowned and must stay alive.
  template<typename T>
  SubstituteOp &AddRepeatArg(const char *name, const T *value, uint32_t len) {
    MACE_ASSERT(repeat_arg_idx_ < kMaxArgNum);
    repeat_args_[repeat_arg_idx_].name = name;
    repeat_args_[repeat_arg_idx_].ptr = value;
    repeat_args_[repeat_arg_idx_].length = len;
    ++repeat_arg_idx_;
    return *this;
  }

 public:
  // Linear search by name; returns default_value when the name is absent.
  template<typename T>
  T GetArgByName(const char *name, T default_value) const {
    for (uint32_t i = 0; i < arg_idx_; ++i) {
      if (base::strcmp(name, args_[i].name) == 0) {
        return static_cast<T>(args_[i].value);
      }
    }
    return default_value;
  }
  // Linear search by name; returns NULL (and *size = 0) when absent.
  template<typename T>
  const T *GetRepeatArgByName(
      const char *name, uint32_t *size = NULL) const {
    for (uint32_t i = 0; i < repeat_arg_idx_; ++i) {
      if (base::strcmp(name, repeat_args_[i].name) == 0) {
        if (size != NULL) {
          *size = repeat_args_[i].length;
        }
        return static_cast<const T *>(repeat_args_[i].ptr);
      }
    }
    if (size != NULL) {
      *size = 0;
    }
    return NULL;
  }
  // Input/output accessors mirroring the real Operator interface.
  uint32_t GetInputSize();
  const void *DoGetInputData(uint32_t idx);
  uint32_t GetInputShapeDimSize(uint32_t idx);
  const int32_t *GetInputShapeDims(uint32_t idx);
  uint32_t GetOutputSize();
  void *DoGetOutputData(uint32_t idx);
  uint32_t GetOutputShapeDimSize(uint32_t idx);
  const int32_t *GetOutputShapeDims(uint32_t idx);
  // Writes the computed shape into the registered output dims buffer.
  MaceStatus ResizeOutputShape(uint32_t idx, uint32_t input_dim_size,
                               const int32_t *input_dims);
  // No-op in tests; see the .cc file.
  MaceStatus ReuseInputBufferForOutput(uint32_t output_idx, uint32_t input_idx);
  // Typed convenience wrappers over the raw data accessors.
  template<typename T>
  const T *GetInputData(uint32_t idx) {
    return static_cast<const T *>(DoGetInputData(idx));
  }
  template<typename T>
  T *GetOutputData(uint32_t idx) {
    return static_cast<T *>(DoGetOutputData(idx));
  }

 private:
  // Registered inputs (unowned pointers) and their shapes.
  const void *inputs_[kMaxInputNum];
  const int32_t *input_dims_[kMaxInputNum];
  uint32_t input_dim_sizes_[kMaxInputNum];
  uint32_t input_idx_;  // number of inputs registered so far
  // Registered outputs (unowned, writable) and their shapes.
  void *outputs_[kMaxOutputNum];
  int32_t *output_dims_[kMaxOutputNum];
  uint32_t output_dim_sizes_[kMaxOutputNum];
  uint32_t output_idx_;  // number of outputs registered so far
  // for arg
  Arg args_[kMaxArgNum];
  uint32_t arg_idx_;  // number of scalar args registered so far
  RepeatArg repeat_args_[kMaxArgNum];
  uint32_t repeat_arg_idx_;  // number of repeated args registered so far
};
} // namespace framework
} // namespace micro
#endif // MICRO_TEST_CCUTILS_MICRO_OPS_SUBSTITUTE_OP_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/test_utils.h"
namespace micro {
namespace ops {
namespace test {
namespace {
// Parameters of the linear-congruential byte generator used by
// FillRandomInput below.
const int32_t kRandM = 1 << 20;
const int32_t kRandA = 9;
const int32_t kRandB = 7;
}
// Logs a tensor shape: ranks 1-4 are printed on one "[ a, b, ... ]" line,
// higher ranks are printed one dimension per line.
void PrintDims(const int32_t *dims, const uint32_t dim_size) {
  // Typo fix: the assertion message read "invalide".
  MACE_ASSERT1(dim_size > 0, "invalid dim size");
  if (dim_size == 1) {
    LOG(INFO) << "[ " << dims[0] << " ]";
  } else if (dim_size == 2) {
    LOG(INFO) << "[ " << dims[0] << ", " << dims[1] << " ]";
  } else if (dim_size == 3) {
    LOG(INFO) << "[ " << dims[0] << ", " << dims[1] << ", " << dims[2] << " ]";
  } else if (dim_size == 4) {
    LOG(INFO) << "[ " << dims[0] << ", " << dims[1]
              << ", " << dims[2] << ", " << dims[3] << " ]";
  } else {
    for (uint32_t i = 0; i < dim_size; ++i) {
      LOG(INFO) << dims[i];
    }
  }
}
// Aborts (LOG(FATAL)) unless the two shapes have identical rank and
// identical extents; on an extent mismatch both shapes are logged first.
void AssertSameDims(const int32_t *x_dims, const uint32_t x_dim_size,
                    const int32_t *y_dims, const uint32_t y_dim_size) {
  if (x_dim_size != y_dim_size) {
    // Typo fix: the message read "invalide".
    LOG(FATAL) << "invalid dim size. x_dim_size = " << x_dim_size
               << ", y_dim_size = " << y_dim_size;
  }
  for (uint32_t i = 0; i < x_dim_size; ++i) {
    if (x_dims[i] != y_dims[i]) {
      PrintDims(x_dims, x_dim_size);
      PrintDims(y_dims, y_dim_size);
      LOG(FATAL) << "AssertSameDims failed.";
    }
  }
}
// Fills `shape_size` BYTES of `input` (callers pass element_count *
// sizeof(T)) with a simple LCG byte sequence seeded from the current time.
void FillRandomInput(void *input, const int32_t shape_size) {
  uint8_t *mem = static_cast<uint8_t * > (input);
  mem[0] = port::api::NowMicros() % 256;
  for (int32_t i = 1; i < shape_size; ++i) {
    // NOTE(review): the "% kRandM" (kRandM == 1 << 20) is a no-op here —
    // the result is truncated to uint8_t by the assignment anyway.
    mem[i] = (kRandA * mem[i - 1] + kRandB) % kRandM;
  }
}
} // namespace test
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_TEST_CCUTILS_MICRO_OPS_TEST_UTILS_H_
#define MICRO_TEST_CCUTILS_MICRO_OPS_TEST_UTILS_H_
#include "micro/base/logging.h"
#include "micro/common/global_buffer.h"
#include "micro/include/public/micro.h"
#include "micro/port/api.h"
namespace micro {
namespace ops {
namespace test {
// Logs a shape as "[ d0, d1, ... ]"; aborts when dim_size == 0.
void PrintDims(const int32_t *dims, const uint32_t dim_size);
// Aborts (LOG(FATAL)) unless the two shapes match in rank and every dim.
void AssertSameDims(const int32_t *x_dims, const uint32_t x_dim_size,
                    const int32_t *y_dims, const uint32_t y_dim_size);
// Fills `shape_size` bytes of `input` with time-seeded pseudo-random data.
void FillRandomInput(void *input, const int32_t shape_size);
// Declares `input` as a T buffer of `shape_size` elements taken from the
// global test buffer and fills it with random bytes.
// NOTE(review): an unrelated macro with this same name exists in
// rpc/skel/base_func.h; if both headers are included in one translation
// unit, the #ifndef guard silently keeps only the first definition —
// consider renaming one of them.
#ifndef MACE_DEFINE_RANDOM_INPUT
#define MACE_DEFINE_RANDOM_INPUT(T, input, shape_size) \
T *input = common::test::GetGlobalBuffer()->GetBuffer<T>(shape_size); \
micro::ops::test::FillRandomInput(input, shape_size * sizeof(T))
#endif
} // namespace test
} // namespace ops
} // namespace micro
#endif // MICRO_TEST_CCUTILS_MICRO_OPS_TEST_UTILS_H_
#!/usr/bin/env bash
# Generates RPC stub/skel sources from a Hexagon IDL file with the qaic
# IDL compiler shipped in the Hexagon SDK.
#
# Usage: <script> <output_dir> [extra qaic args ...]
# Requires: HEXAGON_SDK_ROOT pointing at the Hexagon SDK installation.
set -e

output_dir=${1}
# Bug fix: the original ran `mkdir -p output_dir`, which created a literal
# directory named "output_dir" instead of expanding the first argument.
mkdir -p "${output_dir}"

# Print the full command line first so build logs show exactly what ran.
echo $HEXAGON_SDK_ROOT/tools/qaic/Ubuntu16/qaic \
    -mdll -o ${output_dir} \
    -I$HEXAGON_SDK_ROOT/libs/fastcv/dspCV/android_Debug/ship \
    -I$HEXAGON_SDK_ROOT/libs/common/rpcmem/android_Debug/ship \
    -I$HEXAGON_SDK_ROOT/libs/common/adspmsgd/ship/android_Debug \
    -I$HEXAGON_SDK_ROOT/incs \
    -I$HEXAGON_SDK_ROOT/libs/common/remote/ship/android_Debug \
    -I$HEXAGON_SDK_ROOT/incs/stddef \
    ${@:2}
$HEXAGON_SDK_ROOT/tools/qaic/Ubuntu16/qaic \
    -mdll -o ${output_dir} \
    -I$HEXAGON_SDK_ROOT/libs/fastcv/dspCV/android_Debug/ship \
    -I$HEXAGON_SDK_ROOT/libs/common/rpcmem/android_Debug/ship \
    -I$HEXAGON_SDK_ROOT/libs/common/adspmsgd/ship/android_Debug \
    -I$HEXAGON_SDK_ROOT/incs \
    -I$HEXAGON_SDK_ROOT/libs/common/remote/ship/android_Debug \
    -I$HEXAGON_SDK_ROOT/incs/stddef \
    ${@:2}
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "rpc/skel/base_func.h"
#include <HAP_perf.h>
namespace rpc {
namespace skel {
namespace {
// Linear congruential generator parameters for FillRandomValue:
// next = (kRandA * prev + kRandB) % kRandM.
const int32_t kRandM = 1 << 20;
const int32_t kRandA = 9;
const int32_t kRandB = 7;
}  // namespace
// Fills `buffer_size` bytes of `buffer` with a pseudo-random byte stream
// produced by a linear congruential sequence; the first byte is seeded
// from the DSP wall clock so repeated runs use different data.
void FillRandomValue(void *buffer, const int32_t buffer_size) {
  uint8_t *bytes = static_cast<uint8_t *>(buffer);
  uint8_t prev = static_cast<uint8_t>(HAP_perf_get_time_us() % 256);
  bytes[0] = prev;
  for (int32_t idx = 1; idx < buffer_size; ++idx) {
    prev = static_cast<uint8_t>((kRandA * prev + kRandB) % kRandM);
    bytes[idx] = prev;
  }
}
} // namespace skel
} // namespace rpc
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_TEST_CCUTILS_RPC_SKEL_BASE_FUNC_H_
#define MICRO_TEST_CCUTILS_RPC_SKEL_BASE_FUNC_H_
#include <HAP_perf.h>
#include <stdlib.h>
#include "AEEStdErr.h" // NOLINT
#include "remote.h" // NOLINT
// Defines the qaic-style NAME_open / NAME_close entry points for a DSP
// skeleton, backed by one static handle whose value is seeded from the
// DSP clock on first open.
// NOTE(review): this macro name collides with the unrelated
// MACE_DEFINE_RANDOM_INPUT in micro/ops/test_utils.h; if both headers are
// included in one translation unit the #ifndef guard keeps whichever was
// included first — consider renaming one of them.
#ifndef MACE_DEFINE_RANDOM_INPUT
#define MACE_DEFINE_RANDOM_INPUT(NAME) \
static remote_handle64 h##NAME = -1; \
int NAME##_open(const char *uri, remote_handle64 *h) { \
if (h##NAME == -1) { \
h##NAME = (remote_handle64)(HAP_perf_get_time_us()); \
} \
if (h##NAME == NULL) { \
h##NAME = -1; \
return AEE_ENOMEMORY; \
} \
*h = h##NAME; \
return AEE_SUCCESS; \
} \
int NAME##_close(remote_handle64 h) { \
if (h != h##NAME) { \
return AEE_EBADPARM; \
} \
if (h##NAME != -1) { \
} \
h##NAME = -1; \
return AEE_SUCCESS; \
}
#endif // MACE_DEFINE_RANDOM_INPUT
#ifdef __cplusplus
namespace rpc {
namespace skel {
#endif // __cplusplus
// Fills `shape_size` bytes of `input` with time-seeded pseudo-random data.
void FillRandomValue(void *input, const int32_t shape_size);
#ifdef __cplusplus
} // namespace skel
} // namespace rpc
#endif // __cplusplus
#endif // MICRO_TEST_CCUTILS_RPC_SKEL_BASE_FUNC_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "rpc/stub/base_handle.h"
namespace rpc {
namespace stub {
namespace {
const remote_handle64 IVALID_HANDLE = -1;
}
BaseHandle::BaseHandle(FuncOpen *func_open,
FuncClose *func_close,
const char *uri)
: func_open_(func_open),
func_close_(func_close),
uri_(uri),
remote_handle_(IVALID_HANDLE) {}
BaseHandle::~BaseHandle() {
Close();
}
bool BaseHandle::Open() {
if (Valid()) {
return true;
}
int ret = func_open_(uri_, &remote_handle_);
if (ret != 0 || remote_handle_ == IVALID_HANDLE) {
remote_handle_ = IVALID_HANDLE;
return false;
} else {
return true;
}
}
bool BaseHandle::Close() {
bool status = true;
if (Valid()) {
int ret = func_close_(remote_handle_);
remote_handle_ = IVALID_HANDLE;
if (ret != 0) {
status = false;
}
}
return status;
}
bool BaseHandle::Valid() {
return (remote_handle_ != IVALID_HANDLE);
}
} // namespace stub
} // namespace rpc
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_TEST_CCUTILS_RPC_STUB_BASE_HANDLE_H_
#define MICRO_TEST_CCUTILS_RPC_STUB_BASE_HANDLE_H_
#include <memory>
#include "remote.h" // NOLINT
namespace rpc {
namespace stub {
// Owns a FastRPC remote_handle64 session: Open() invokes the generated
// *_open entry point, Close() the matching *_close, and the destructor
// closes any session that is still open.
// NOTE(review): the destructor is not virtual although the name suggests
// this class is meant to be derived from; confirm no subclass is ever
// deleted through a BaseHandle* before relying on that.
class BaseHandle {
 protected:
  // Signatures of the qaic-generated open/close entry points.
  typedef int FuncOpen(const char *name, remote_handle64 *h);
  typedef int FuncClose(remote_handle64 h);
  FuncOpen *func_open_;    // open entry point (not owned)
  FuncClose *func_close_;  // close entry point (not owned)
  const char *uri_;        // URI forwarded to func_open_ (not owned)
  remote_handle64 remote_handle_;  // current session, or invalid sentinel
 public:
  explicit BaseHandle(FuncOpen *func_open,
                      FuncClose *func_close,
                      const char *uri);
  ~BaseHandle();
  bool Open();   // Opens the session if needed; true on success.
  bool Close();  // Closes the session if open; true unless close failed.
  bool Valid();  // True while a session is open.
};
} // namespace stub
} // namespace rpc
#endif // MICRO_TEST_CCUTILS_RPC_STUB_BASE_HANDLE_H_
licenses(["notice"]) # Apache 2.0
# Statically linked command-line runner for generated MACE Micro models
# (micro_run.cc): links the codegen'd model library and the micro engine.
cc_binary(
    name = "micro_run_static",
    srcs = [
        "micro_run.cc",
    ],
    copts = [
        "-Werror",
        "-std=c++11",
        "-Wextra",
        "-Wno-missing-field-initializers",
    ],
    # Static link so the binary can be pushed to a device standalone.
    linkstatic = 1,
    deps = [
        "//external:gflags_nothreads",
        "//micro/codegen:generated_models",
        "//micro/codegen:micro_engine",
    ],
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* Usage:
* micro_run --input=input_node \
* --output=output_node \
* --input_shape=1,224,224,3 \
* --output_shape=1,224,224,2 \
* --input_file=input_data \
* --output_file=micro.out
*/
#include <dirent.h>
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <numeric>
#include "gflags/gflags.h"
#include "micro/base/logging.h"
#include "micro/include/public/micro.h"
#include "micro/include/utils/macros.h"
#include "micro/port/api.h"
#ifndef MICRO_MODEL_NAME
#error Please specify model name in the command
#endif
namespace micro {
namespace MICRO_MODEL_NAME {
MaceStatus GetMicroEngineSingleton(MaceMicroEngine **engine);
}
namespace tools {
// Splits `str` on `delims`, keeping empty leading/middle pieces.
// An empty input yields an empty vector; a trailing delimiter does not
// produce a trailing empty piece.
std::vector<std::string> Split(const std::string &str, char delims) {
  std::vector<std::string> pieces;
  std::string rest = str;
  for (;;) {
    if (rest.empty()) {
      break;
    }
    const size_t pos = rest.find(delims);
    pieces.push_back(rest.substr(0, pos));
    if (pos == std::string::npos) {
      break;
    }
    rest.erase(0, pos + 1);
  }
  return pieces;
}
// Parses a comma-separated dimension list such as "1,224,224,3" and
// appends each dimension to *shape. An empty string appends nothing.
void ParseShape(const std::string &str, std::vector<int32_t> *shape) {
  std::string remaining = str;
  while (!remaining.empty()) {
    shape->push_back(atoi(remaining.c_str()));
    const size_t comma = remaining.find(',');
    if (comma == std::string::npos) {
      break;
    }
    remaining = remaining.substr(comma + 1);
  }
}
// Returns `input` with every non-alphanumeric character replaced by '_',
// producing a string safe to use in file names.
// Fixes: isalnum() was called on a raw char, which is undefined behavior
// for negative char values (e.g. non-ASCII bytes on signed-char
// platforms) — the argument is now cast to unsigned char. The parameter
// is also taken by const reference to avoid an extra copy.
std::string FormatName(const std::string &input) {
  std::string res = input;
  for (size_t i = 0; i < res.size(); ++i) {
    if (!isalnum(static_cast<unsigned char>(res[i]))) res[i] = '_';
  }
  return res;
}
// Maps a data-format flag value ("NHWC", "NCHW", "OIHW") to the
// corresponding enum; any other string maps to DataFormat::NONE.
DataFormat ParseDataFormat(const std::string &data_format_str) {
  if (data_format_str == "NHWC") return DataFormat::NHWC;
  if (data_format_str == "NCHW") return DataFormat::NCHW;
  if (data_format_str == "OIHW") return DataFormat::OIHW;
  return DataFormat::NONE;
}
// Command-line flags. Multi-tensor flags are comma-separated; shape lists
// use ':' between tensors and ',' between dims (e.g. "1,224,224,3:1,10").
DEFINE_string(model_name, "", "model name in yaml");
DEFINE_string(input_node, "", "input nodes, separated by comma");
DEFINE_string(input_shape, "",
              "input shapes, separated by colon and comma");
DEFINE_string(output_node, "", "output nodes, separated by comma");
DEFINE_string(output_shape, "",
              "output shapes, separated by colon and comma");
DEFINE_string(input_data_format, "NHWC",
              "input data formats, NONE|NHWC|NCHW");
DEFINE_string(output_data_format, "NHWC",
              "output data formats, NONE|NHWC|NCHW");
DEFINE_string(input_file, "",
              "input file name | input file prefix for multiple inputs.");
DEFINE_string(output_file, "",
              "output file name | output file prefix for multiple outputs");
// When input_dir is set, every matching file set in it is run as one
// sample and input_file is ignored (see RunModel).
DEFINE_string(input_dir, "", "input directory name")
DEFINE_string(output_dir, "output", "output directory name");
DEFINE_int32(round, 1, "round");
DEFINE_int32(restart_round, 1, "restart round");
DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
DEFINE_bool(benchmark, false, "enable benchmark op");
// Fetches every model output from `micro_engine` and writes it as raw
// binary to "<prefix><sanitized output name><suffix>". Aborts via the
// MACE assert macros on engine or file errors.
// NOTE(review): output byte size is computed as product(dims) *
// sizeof(float), i.e. outputs are assumed to be float — confirm for
// quantized models.
void GetOutputAndStoreToFile(MaceMicroEngine *micro_engine,
                             const std::vector<std::string> &output_names,
                             const std::string &prefix,
                             const std::string &suffix) {
  for (size_t i = 0; i < output_names.size(); ++i) {
    void *output_buffer = NULL;
    const int32_t *output_dims = NULL;
    uint32_t dim_size = 0;
    MaceStatus status =
        micro_engine->GetOutputData(i, &output_buffer, &output_dims, &dim_size);
    MACE_UNUSED(status);
    MACE_ASSERT1(status == MACE_SUCCESS && output_buffer != NULL,
                 "GetOutputData failed");
    std::string output_name = prefix + FormatName(output_names[i]) + suffix;
    std::ofstream out_file(output_name, std::ios::binary);
    MACE_ASSERT2(out_file.is_open(), "Open output file failed: ",
                 strerror(errno));
    // Byte count = product of dims, seeded with sizeof(float).
    int64_t output_size = std::accumulate(output_dims, output_dims + dim_size,
                                          sizeof(float),
                                          std::multiplies<int64_t>());
    out_file.write(reinterpret_cast<char *>(output_buffer),
                   output_size);
    MACE_ASSERT1(!out_file.bad(), "write file failed!");
    out_file.flush();
    out_file.close();
    LOG(INFO) << "Write output file " << output_name.c_str()
              << " with size " << output_size << " done.";
  }
}
// Runs the generated micro engine either over every sample found in
// FLAGS_input_dir (outputs mirrored to FLAGS_output_dir) or over the
// single sample in FLAGS_input_file (with warm-up and FLAGS_round timed
// iterations, results written with the FLAGS_output_file prefix).
// Returns true on success, false when an input file cannot be opened.
bool RunModel(const std::vector<std::string> &input_names,
              const std::vector<std::vector<int32_t>> &input_shapes,
              const std::vector<DataFormat> &input_data_formats,
              const std::vector<std::string> &output_names,
              const std::vector<DataFormat> &output_data_formats) {
  // for future
  MACE_UNUSED(input_data_formats);
  MACE_UNUSED(output_data_formats);
  int64_t t0 = port::api::NowMicros();
  MaceMicroEngine *micro_engine = NULL;
  MaceStatus status = MICRO_MODEL_NAME::GetMicroEngineSingleton(&micro_engine);
  MACE_UNUSED(status);
  MACE_ASSERT(status == MACE_SUCCESS && micro_engine != NULL);
  int64_t t1 = port::api::NowMicros();
  double init_millis = (t1 - t0) / 1000.0;
  LOG(INFO) << "Total init latency: "
            << static_cast<float>(init_millis) << " ms";
  // Pre-allocate one host buffer per model input (float data assumed —
  // sizes are product(shape) * sizeof(float)).
  std::vector<std::shared_ptr<char>> inputs;
  std::vector<int32_t> input_sizes;
  for (size_t i = 0; i < input_shapes.size(); ++i) {
    input_sizes.push_back(std::accumulate(input_shapes[i].begin(),
                                          input_shapes[i].end(), sizeof(float),
                                          std::multiplies<int32_t>()));
    inputs.push_back(std::shared_ptr<char>(new char[input_sizes[i]],
                                           std::default_delete<char[]>()));
  }
  if (!FLAGS_input_dir.empty()) {
    DIR *dir_parent;
    struct dirent *entry;
    dir_parent = opendir(FLAGS_input_dir.c_str());
    if (dir_parent == NULL) {
      LOG(FATAL) << "Open input_dir " << FLAGS_input_dir.c_str()
                 << " failed: " << strerror(errno);
    }
    // Each file named "<first input name><suffix>" marks one sample; the
    // same suffix selects the matching file for every other input tensor.
    while ((entry = readdir(dir_parent))) {
      std::string file_name = std::string(entry->d_name);
      std::string prefix = FormatName(input_names[0]);
      if (file_name.find(prefix) == 0) {
        std::string suffix = file_name.substr(prefix.size());
        for (size_t i = 0; i < input_names.size(); ++i) {
          file_name = FLAGS_input_dir + "/" + FormatName(input_names[i])
              + suffix;
          std::ifstream in_file(file_name, std::ios::in | std::ios::binary);
          LOG(INFO) << "Read " << file_name.c_str();
          MACE_ASSERT2(in_file.is_open(), "Open input file failed: ",
                       strerror(errno));
          in_file.read(inputs[i].get(), input_sizes[i]);
          in_file.close();
          micro_engine->RegisterInputData(i, inputs[i].get(),
                                          input_shapes[i].data());
        }
        status = micro_engine->Run();
        MACE_ASSERT(status == MACE_SUCCESS);
        if (!FLAGS_output_dir.empty()) {
          GetOutputAndStoreToFile(micro_engine, output_names,
                                  FLAGS_output_dir + "/", suffix);
        }
      }
    }
    closedir(dir_parent);
  } else {
    for (size_t i = 0; i < input_names.size(); ++i) {
      // load input
      std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
                            std::ios::in | std::ios::binary);
      if (in_file.is_open()) {
        in_file.read(inputs[i].get(), input_sizes[i]);
        in_file.close();
      } else {
        LOG(INFO) << "Open input file failed";
        // Bug fix: was `return -1;`, which implicitly converts to true in
        // this bool-returning function and reported the failure as success.
        return false;
      }
      micro_engine->RegisterInputData(i, inputs[i].get(),
                                      input_shapes[i].data());
    }
    LOG(INFO) << "Warm up run";
    int64_t t3 = port::api::NowMicros();
    status = micro_engine->Run();
    MACE_ASSERT1(status == MACE_SUCCESS, "run micro engine failed");
    int64_t t4 = port::api::NowMicros();
    double warmup_millis = (t4 - t3) / 1000.0;
    LOG(INFO) << "1st warm up run latency: "
              << static_cast<float>(warmup_millis) << " ms";
    double model_run_millis = -1;
    if (FLAGS_round > 0) {
      LOG(INFO) << "Run model";
      int64_t total_run_duration = 0;
      for (int i = 0; i < FLAGS_round; ++i) {
        int64_t t0 = port::api::NowMicros();
        // TODO(luxuhui): add metadata to benchmark op
        status = micro_engine->Run();
        MACE_ASSERT(status == MACE_SUCCESS);
        int64_t t1 = port::api::NowMicros();
        total_run_duration += (t1 - t0);
      }
      model_run_millis = total_run_duration / 1000.0 / FLAGS_round;
      LOG(INFO) << "Average latency: "
                << static_cast<float>(model_run_millis) << " ms";
    }
    GetOutputAndStoreToFile(micro_engine, output_names,
                            FLAGS_output_file + "_", "");
    // Metrics reporting tools depends on the format, keep in consistent
    printf("=============================================\n");
    printf("---- init warmup run_avg \n");
    printf("=============================================\n");
    printf("time %11.3f %11.3f %11.3f\n",
           init_millis, warmup_millis, model_run_millis);
  }
  return true;
}
int Main(int argc, char **argv) {
std::string usage = "MACE micro run model tool, please specify proper"
" arguments.\nusage: " + std::string(argv[0]) + " --help";
gflags::SetUsageMessage(usage);
gflags::ParseCommandLineFlags(&argc, &argv, true);
std::vector<std::string> input_names = Split(FLAGS_input_node, ',');
std::vector<std::string> output_names = Split(FLAGS_output_node, ',');
if (input_names.empty() || output_names.empty()) {
LOG(INFO) << gflags::ProgramUsage();
return 0;
}
LOG(INFO) << "model name: " << FLAGS_model_name.c_str();
LOG(INFO) << "input node: " << FLAGS_input_node.c_str();
LOG(INFO) << "input shape: " << FLAGS_input_shape.c_str();
LOG(INFO) << "output node: " << FLAGS_output_node.c_str();
LOG(INFO) << "output shape: " << FLAGS_output_shape.c_str();
LOG(INFO) << "input_file: " << FLAGS_input_file.c_str();
LOG(INFO) << "output_file: " << FLAGS_output_file.c_str();
LOG(INFO) << "input dir: " << FLAGS_input_dir.c_str();
LOG(INFO) << "output dir: " << FLAGS_output_dir.c_str();
LOG(INFO) << "round: " << FLAGS_round;
LOG(INFO) << "restart_round: " << FLAGS_restart_round;
std::vector<std::string> input_shapes = Split(FLAGS_input_shape, ':');
std::vector<std::string> output_shapes = Split(FLAGS_output_shape, ':');
const size_t input_count = input_shapes.size();
const size_t output_count = output_shapes.size();
std::vector<std::vector<int32_t>> input_shape_vec(input_count);
std::vector<std::vector<int32_t>> output_shape_vec(output_count);
for (size_t i = 0; i < input_count; ++i) {
ParseShape(input_shapes[i], &input_shape_vec[i]);
}
for (size_t i = 0; i < output_count; ++i) {
ParseShape(output_shapes[i], &output_shape_vec[i]);
}
if (input_names.size() != input_shape_vec.size()
|| output_names.size() != output_shape_vec.size()) {
LOG(INFO) << "inputs' names do not match inputs' shapes "
"or outputs' names do not match outputs' shapes";
return 0;
}
std::vector<std::string> raw_input_data_formats =
Split(FLAGS_input_data_format, ',');
std::vector<std::string> raw_output_data_formats =
Split(FLAGS_output_data_format, ',');
std::vector<DataFormat> input_data_formats(input_count);
std::vector<DataFormat> output_data_formats(output_count);
for (size_t i = 0; i < input_count; ++i) {
input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]);
}
for (size_t i = 0; i < output_count; ++i) {
output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
}
bool ret = false;
for (int i = 0; i < FLAGS_restart_round; ++i) {
LOG(INFO) << "restart round " << i;
ret = RunModel(input_names, input_shape_vec, input_data_formats,
output_names, output_data_formats);
}
if (ret) {
return 0;
}
return -1;
}
} // namespace tools
} // namespace micro
// Bug fix: Main's return value was discarded, so the process always
// exited 0 even when the run failed; propagate it as the exit code.
int main(int argc, char **argv) {
  return micro::tools::Main(argc, argv);
}
...@@ -29,7 +29,7 @@ namespace testing { ...@@ -29,7 +29,7 @@ namespace testing {
class Benchmark { class Benchmark {
public: public:
Benchmark(const char *name, void (*benchmark_func)(int)); Benchmark(const char *name, void (*benchmark_func)(int32_t));
static void Run(const char *pattern); static void Run(const char *pattern);
......
# Bazel wrappers around a local Hexagon SDK checkout: these targets expose
# the SDK's headers and prebuilt libraries to the rest of the build.
package(default_visibility = ['//visibility:public'])
# Marker target proving the SDK repository was located.
filegroup(
    name = 'sdk_location',
    srcs = ["readme.txt"],
)
# Common SDK headers (incs/).
cc_library(
    name = 'headers_incs',
    hdrs = glob([
        "incs/*.h",
    ]),
    strip_include_prefix = "incs/",
)
# Freestanding stddef-style headers (incs/stddef/).
cc_library(
    name = 'headers_incs_stddef',
    hdrs = glob([
        "incs/stddef/*.h",
    ]),
    strip_include_prefix = "incs/stddef/",
)
# Remote-call headers for the Hexagon (DSP-side) build.
cc_library(
    name = 'headers_dsp',
    hdrs = glob([
        "libs/common/remote/ship/hexagon_Release_toolv81_v60/*.h",
    ]),
    strip_include_prefix = "libs/common/remote/ship/hexagon_Release_toolv81_v60/",
    deps = [
        ":headers_incs",
        ":headers_incs_stddef",
        "@hexagon_tools//:headers_tools_target",
    ],
)
# Remote-call headers for the Android aarch64 (CPU-side) build.
cc_library(
    name = 'headers_arm',
    hdrs = glob([
        "libs/common/remote/ship/android_Release_aarch64/*.h",
    ]),
    strip_include_prefix = "libs/common/remote/ship/android_Release_aarch64/",
    deps = [
        ":headers_incs",
        ":headers_incs_stddef",
    ],
)
# Prebuilt FastRPC runtime libraries for the Android aarch64 side.
cc_library(
    name = 'sdk_arm',
    srcs = glob([
        "libs/common/remote/ship/android_Release_aarch64/libcdsprpc.so",
        "libs/common/rpcmem/rpcmem.a",
    ]),
    deps = [
        ":headers_arm",
    ],
)
\ No newline at end of file
# Pieces of a local Hexagon tools installation, exposed to the crosstool
# definition in //tools/hexagon_compiler. Each filegroup names one tool
# binary; "compiler_pieces" bundles the support trees the tools need.
package(default_visibility = ["//visibility:public"])

# Hexagon target headers shipped with the toolchain.
cc_library(
    name = "headers_tools_target",
    hdrs = glob([
        "target/hexagon/include/**/*.h",
    ]),
    strip_include_prefix = "target/hexagon/include/",
)

filegroup(
    name = "gcc",
    srcs = ["bin/hexagon-clang"],
)

filegroup(
    name = "ar",
    srcs = ["bin/hexagon-ar"],
)

filegroup(
    name = "ld",
    srcs = ["bin/hexagon-link"],
)

filegroup(
    name = "nm",
    srcs = ["bin/hexagon-nm"],
)

filegroup(
    name = "objcopy",
    srcs = ["bin/hexagon-elfcopy"],
)

filegroup(
    name = "objdump",
    srcs = ["bin/hexagon-llvm-objdump"],
)

filegroup(
    name = "strip",
    srcs = ["bin/hexagon-strip"],
)

filegroup(
    name = "as",
    srcs = ["bin/hexagon-llvm-mc"],
)

# Support files (runtime libs, headers, helper binaries) the tools need.
filegroup(
    name = "compiler_pieces",
    srcs = glob([
        "libexec/**",
        "lib/**",
        "include/**",
    ]),
)

# Convenience bundle of every tool binary above.
filegroup(
    name = "compiler_components",
    srcs = [
        ":ar",
        ":as",
        ":gcc",
        ":ld",
        ":nm",
        ":objcopy",
        ":objdump",
        ":strip",
    ],
)
...@@ -128,3 +128,22 @@ build:ubsan --copt -O0 ...@@ -128,3 +128,22 @@ build:ubsan --copt -O0
build:ubsan --copt -fno-omit-frame-pointer build:ubsan --copt -fno-omit-frame-pointer
build:ubsan --linkopt -fsanitize=undefined build:ubsan --linkopt -fsanitize=undefined
build:ubsan --linkopt -lubsan build:ubsan --linkopt -lubsan
# Usage example: bazel build --config hexagon_qualcomm
build:hexagon_qualcomm --crosstool_top=//tools/hexagon_compiler:toolchain
build:hexagon_qualcomm --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:hexagon_qualcomm --cpu=hexagon
build:hexagon_qualcomm --copt=-std=c++98
build:hexagon_qualcomm --copt -Wno-ignored-attributes
build:hexagon_qualcomm --copt -Wno-unused-function
build:hexagon_qualcomm --copt -Wno-sequence-point
build:hexagon_qualcomm --copt -Wno-implicit-fallthrough
build:hexagon_qualcomm --copt -Wno-missing-braces
build:hexagon_qualcomm --copt -mv60
build:hexagon_qualcomm --copt -Wno-cast-align
build:hexagon_qualcomm --copt -Wpointer-arith
build:hexagon_qualcomm --copt -Wnested-externs
build:hexagon_qualcomm --copt -Wno-pointer-to-int-cast
build:hexagon_qualcomm --copt -Wno-int-to-pointer-cast
build:hexagon_qualcomm --copt -fno-rtti
build:hexagon_qualcomm --copt -fno-exceptions
...@@ -85,6 +85,7 @@ InOutDataType = Enum('InputDataType', ...@@ -85,6 +85,7 @@ InOutDataType = Enum('InputDataType',
FPDataTypeStrs = [ FPDataTypeStrs = [
"fp16_fp32", "fp16_fp32",
"fp32_fp32", "fp32_fp32",
"bf16_fp32",
] ]
FPDataType = Enum('GPUDataType', [(ele, ele) for ele in FPDataTypeStrs], FPDataType = Enum('GPUDataType', [(ele, ele) for ele in FPDataTypeStrs],
...@@ -278,8 +279,10 @@ def get_model_files(model_config, model_output_dir): ...@@ -278,8 +279,10 @@ def get_model_files(model_config, model_output_dir):
model_file_path = model_config[YAMLKeyword.model_file_path] model_file_path = model_config[YAMLKeyword.model_file_path]
model_sha256_checksum = model_config[YAMLKeyword.model_sha256_checksum] model_sha256_checksum = model_config[YAMLKeyword.model_sha256_checksum]
weight_file_path = model_config.get(YAMLKeyword.weight_file_path, "") weight_file_path = model_config.get(YAMLKeyword.weight_file_path, "")
weight_sha256_checksum = model_config.get(YAMLKeyword.weight_sha256_checksum, "") # noqa weight_sha256_checksum = \
quantize_range_file_path = model_config.get(YAMLKeyword.quantize_range_file, "") # noqa model_config.get(YAMLKeyword.weight_sha256_checksum, "")
quantize_range_file_path = \
model_config.get(YAMLKeyword.quantize_range_file, "")
model_file = model_file_path model_file = model_file_path
weight_file = weight_file_path weight_file = weight_file_path
quantize_range_file = quantize_range_file_path quantize_range_file = quantize_range_file_path
...@@ -808,7 +811,12 @@ def convert_func(flags): ...@@ -808,7 +811,12 @@ def convert_func(flags):
else: else:
model_graph_format = configs.get(YAMLKeyword.model_graph_format, model_graph_format = configs.get(YAMLKeyword.model_graph_format,
"file") "file")
if model_graph_format == ModelFormat.code: embed_graph_def = model_graph_format == ModelFormat.code
if flags.enable_micro:
mace_check((not embed_model_data) and (not embed_graph_def),
ModuleName.YAML_CONFIG,
"You should specify file mode to convert micro model.")
if embed_graph_def:
os.makedirs(model_header_dir) os.makedirs(model_header_dir)
sh_commands.gen_mace_engine_factory_source( sh_commands.gen_mace_engine_factory_source(
configs[YAMLKeyword.models].keys(), configs[YAMLKeyword.models].keys(),
...@@ -816,9 +824,16 @@ def convert_func(flags): ...@@ -816,9 +824,16 @@ def convert_func(flags):
sh.cp("-f", glob.glob("mace/codegen/engine/*.h"), sh.cp("-f", glob.glob("mace/codegen/engine/*.h"),
model_header_dir) model_header_dir)
convert.convert(configs, MODEL_CODEGEN_DIR) convert.convert(configs, MODEL_CODEGEN_DIR, flags.enable_micro)
for model_name, model_config in configs[YAMLKeyword.models].items(): for model_name, model_config in configs[YAMLKeyword.models].items():
if flags.enable_micro:
data_type = model_config.get(YAMLKeyword.data_type, "")
mace_check(data_type == FPDataType.fp32_fp32.value or
data_type == FPDataType.bf16_fp32.value,
ModuleName.YAML_CONFIG,
"You should specify fp32_fp32 or bf16_fp32 data type "
"to convert micro model.")
model_codegen_dir = "%s/%s" % (MODEL_CODEGEN_DIR, model_name) model_codegen_dir = "%s/%s" % (MODEL_CODEGEN_DIR, model_name)
encrypt.encrypt(model_name, encrypt.encrypt(model_name,
"%s/model/%s.pb" % (model_codegen_dir, model_name), "%s/model/%s.pb" % (model_codegen_dir, model_name),
...@@ -837,6 +852,9 @@ def convert_func(flags): ...@@ -837,6 +852,9 @@ def convert_func(flags):
sh.mv("-f", sh.mv("-f",
'%s/model/%s.data' % (model_codegen_dir, model_name), '%s/model/%s.data' % (model_codegen_dir, model_name),
model_output_dir) model_output_dir)
if flags.enable_micro:
sh.mv("-f", '%s/model/%s_micro.tar.gz' %
(model_codegen_dir, model_name), model_output_dir)
else: else:
if not embed_model_data: if not embed_model_data:
sh.mv("-f", sh.mv("-f",
...@@ -1031,6 +1049,10 @@ def parse_args(): ...@@ -1031,6 +1049,10 @@ def parse_args():
'convert', 'convert',
parents=[all_type_parent_parser, convert_run_parent_parser], parents=[all_type_parent_parser, convert_run_parent_parser],
help='convert to mace model (file or code)') help='convert to mace model (file or code)')
convert.add_argument(
"--enable_micro",
action="store_true",
help="enable convert micro.")
convert.set_defaults(func=convert_func) convert.set_defaults(func=convert_func)
run = subparsers.add_parser( run = subparsers.add_parser(
......
...@@ -7,3 +7,5 @@ cpplint --linelength=80 --counting=detailed --root=include $(find include -name ...@@ -7,3 +7,5 @@ cpplint --linelength=80 --counting=detailed --root=include $(find include -name
cpplint --linelength=80 --counting=detailed --root=test/ccutils $(find test/ccutils -name "*.h" -or -name "*.cc") cpplint --linelength=80 --counting=detailed --root=test/ccutils $(find test/ccutils -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed --root=test/ccunit $(find test/ccunit -name "*.h" -or -name "*.cc") cpplint --linelength=80 --counting=detailed --root=test/ccunit $(find test/ccunit -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed --root=test/ccbenchmark $(find test/ccbenchmark -name "*.h" -or -name "*.cc") cpplint --linelength=80 --counting=detailed --root=test/ccbenchmark $(find test/ccbenchmark -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed $(find ./micro -path ./micro/codegen -prune -o -name "*.h" -or -name "*.cc")
# Crosstool wiring for the Hexagon toolchain: maps the "hexagon" CPU to
# the cc_toolchain below (selected via --config hexagon_qualcomm).
cc_toolchain_suite(
    name = "toolchain",
    toolchains = {
        "hexagon|gcc": "cc-compiler-hexagon",
    },
)
# Everything the toolchain may touch (wrappers plus the SDK tool tree).
filegroup(
    name = "hexagon_all_files",
    srcs = [
        "//tools/hexagon_compiler/hexagon_gcc:tool-wrappers",
        "@hexagon_tools//:compiler_pieces",
    ],
)
# Inputs required for link actions.
filegroup(
    name = "hexagon_linker_files",
    srcs = [
        "//tools/hexagon_compiler/hexagon_gcc:ar",
        "//tools/hexagon_compiler/hexagon_gcc:gcc",
        "//tools/hexagon_compiler/hexagon_gcc:ld",
        "@hexagon_tools//:compiler_pieces",
    ],
)
# Inputs required for compile actions.
filegroup(
    name = "hexagon_compiler_files",
    srcs = [
        "//tools/hexagon_compiler/hexagon_gcc:as",
        "//tools/hexagon_compiler/hexagon_gcc:gcc",
        "//tools/hexagon_compiler/hexagon_gcc:ld",
    ],
)
filegroup(
    name = "empty",
    srcs = [],
)
cc_toolchain(
    name = "cc-compiler-hexagon",
    all_files = ":hexagon_all_files",
    compiler_files = ":hexagon_compiler_files",
    cpu = "hexagon",
    dwp_files = ":empty",
    dynamic_runtime_libs = [":empty"],
    linker_files = ":hexagon_linker_files",
    objcopy_files = "//tools/hexagon_compiler/hexagon_gcc:objcopy",
    static_runtime_libs = [":empty"],
    strip_files = "//tools/hexagon_compiler/hexagon_gcc:strip",
    supports_param_files = 1,
    visibility = ["//visibility:public"],
)
# Legacy CROSSTOOL definition for the Qualcomm Hexagon toolchain.
major_version: "local"
minor_version: ""
default_target_cpu: "hexagon"

default_toolchain {
  cpu: "hexagon"
  toolchain_identifier: "hexagon-qualcomm"
}

toolchain {
  abi_version: "gcc"
  abi_libc_version: ""
  builtin_sysroot: ""
  compiler: "gcc"
  host_system_name: "hexagon"
  needsPic: true
  supports_incremental_linker: false
  supports_fission: false
  supports_interface_shared_objects: false
  #supports_normalizing_ar: true
  supports_start_end_lib: false
  supports_thin_archives: true
  target_libc: ""
  target_cpu: "hexagon"
  target_system_name: ""
  toolchain_identifier: "hexagon-qualcomm"

  # Tool paths point at the wrapper scripts in hexagon_gcc/, which forward
  # to the real binaries found via the HL_HEXAGON_TOOLS environment variable.
  tool_path { name: "ar" path: "hexagon_gcc/hexagon-qualcomm-ar" }
  tool_path { name: "compat-ld" path: "hexagon_gcc/hexagon-qualcomm-ld" }
  tool_path { name: "cpp" path: "hexagon_gcc/hexagon-qualcomm-gcc" }
  tool_path { name: "dwp" path: "hexagon_gcc/hexagon-qualcomm-dwp" }
  tool_path { name: "gcc" path: "hexagon_gcc/hexagon-qualcomm-gcc" }
  tool_path { name: "gcov" path: "hexagon_gcc/hexagon-qualcomm-gcov" }
  # C(++) compiles invoke the compiler (as that is the one knowing where
  # to find libraries), but we provide LD so other rules can invoke the linker.
  tool_path { name: "ld" path: "hexagon_gcc/hexagon-qualcomm-ld" }
  tool_path { name: "nm" path: "hexagon_gcc/hexagon-qualcomm-nm" }
  tool_path { name: "objcopy" path: "hexagon_gcc/hexagon-qualcomm-objcopy" }
  objcopy_embed_flag: "-I"
  objcopy_embed_flag: "binary"
  tool_path { name: "objdump" path: "hexagon_gcc/hexagon-qualcomm-objdump" }
  tool_path { name: "strip" path: "hexagon_gcc/hexagon-qualcomm-strip" }

  linker_flag: "-Wl"
  linker_flag: "-lm"

  # Anticipated future default.
  # This makes GCC and Clang do what we want when called through symlinks.
  unfiltered_cxx_flag: "-no-canonical-prefixes"
  linker_flag: "-no-canonical-prefixes"

  # Security hardening on by default.
  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
  # We need to undef it before redefining it as some distributions now have
  # it enabled by default.
  compiler_flag: "-fstack-protector"
  compiler_flag: "-fPIE"
  linker_flag: "-v"
  linker_flag: "-pie"

  # for hexagon
  linker_flag: "-march=hexagon"
  linker_flag: "-mcpu=hexagonv60"
  linker_flag: "-shared"
  linker_flag: "-G0"
  linker_flag: "-fPIC"

  # Enable coloring even if there's no attached terminal. Bazel removes the
  # escape sequences if --nocolor is specified.
  compiler_flag: "-fdiagnostics-color=always"

  # All warnings are enabled. Maybe enable -Werror as well?
  compiler_flag: "-Wall"

  compilation_mode_flags {
    mode: OPT
    # No debug symbols.
    # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or
    # even generally? However, that can't happen here, as it requires special
    # handling in Bazel.
    compiler_flag: "-g0"
    # Conservative choice for -O
    # -O3 can increase binary size and even slow down the resulting binaries.
    # Profile first and / or use FDO if you need better performance than this.
    compiler_flag: "-O2"
    compiler_flag: "-ffunction-sections"
    linker_flag: "-Wl"
  }
}
# Wrapper-script filegroups for the Hexagon toolchain; each pairs a local
# hexagon-qualcomm-* wrapper with the real tool from @hexagon_tools.
package(default_visibility = ["//tools/hexagon_compiler:__pkg__"])

filegroup(
    name = "srcs",
    srcs = glob(["**"]),
)

filegroup(
    name = "gcc",
    srcs = [
        "hexagon-qualcomm-gcc",
        "@hexagon_tools//:gcc",
    ],
)

filegroup(
    name = "ar",
    srcs = [
        "hexagon-qualcomm-ar",
        "@hexagon_tools//:ar",
    ],
)

filegroup(
    name = "ld",
    srcs = [
        "hexagon-qualcomm-ld",
        "@hexagon_tools//:ld",
    ],
)

filegroup(
    name = "nm",
    srcs = [
        "hexagon-qualcomm-nm",
        "@hexagon_tools//:nm",
    ],
)

filegroup(
    name = "objcopy",
    srcs = [
        "hexagon-qualcomm-objcopy",
        "@hexagon_tools//:objcopy",
    ],
)

filegroup(
    name = "objdump",
    srcs = [
        "hexagon-qualcomm-objdump",
        "@hexagon_tools//:objdump",
    ],
)

filegroup(
    name = "strip",
    srcs = [
        "hexagon-qualcomm-strip",
        "@hexagon_tools//:strip",
    ],
)

filegroup(
    name = "as",
    srcs = [
        "hexagon-qualcomm-as",
        "@hexagon_tools//:as",
    ],
)

# Convenience bundle of every wrapper, referenced by the toolchain rule.
filegroup(
    name = "tool-wrappers",
    srcs = [
        ":ar",
        ":as",
        ":gcc",
        ":ld",
        ":nm",
        ":objcopy",
        ":objdump",
        ":strip",
    ],
)
#!/bin/bash --norc
# Bazel tool wrapper: forwards to the Hexagon archiver under a stable name.
# Fix: quote the expansion so a HL_HEXAGON_TOOLS containing spaces works.
exec -a hexagon-qualcomm-ar \
  "${HL_HEXAGON_TOOLS}/bin/hexagon-ar" \
  "$@"
#!/bin/bash --norc
# Bazel tool wrapper: forwards to the Hexagon assembler (llvm-mc).
# Fix: quote the expansion so a HL_HEXAGON_TOOLS containing spaces works.
exec -a hexagon-qualcomm-as \
  "${HL_HEXAGON_TOOLS}/bin/hexagon-llvm-mc" \
  "$@"
#!/bin/bash --norc
# Bazel tool wrapper: forwards to the Hexagon clang driver.
# NOTE(review): unlike the sibling wrappers, this one does not rename argv[0]
# via `exec -a` — presumably so clang sees its real name; confirm.
# Fix: quote the expansion so a HL_HEXAGON_TOOLS containing spaces works.
exec \
  "${HL_HEXAGON_TOOLS}/bin/hexagon-clang" \
  "$@"
#!/bin/bash --norc
# Bazel tool wrapper: forwards to the Hexagon coverage tool.
# Fix: quote the expansion so a HL_HEXAGON_TOOLS containing spaces works.
exec -a hexagon-qualcomm-gcov \
  "${HL_HEXAGON_TOOLS}/bin/hexagon-coverage" \
  "$@"
#!/bin/bash --norc
# Bazel tool wrapper: forwards to the Hexagon linker.
# Fix: quote the expansion so a HL_HEXAGON_TOOLS containing spaces works.
exec -a hexagon-qualcomm-ld \
  "${HL_HEXAGON_TOOLS}/bin/hexagon-link" \
  "$@"
#!/bin/bash --norc
# Bazel tool wrapper: forwards to the Hexagon nm.
# Fix: quote the expansion so a HL_HEXAGON_TOOLS containing spaces works.
exec -a hexagon-qualcomm-nm \
  "${HL_HEXAGON_TOOLS}/bin/hexagon-nm" \
  "$@"
#!/bin/bash --norc
# Bazel tool wrapper: forwards to the Hexagon elfcopy (objcopy equivalent).
# Fix: quote the expansion so a HL_HEXAGON_TOOLS containing spaces works.
exec -a hexagon-qualcomm-objcopy \
  "${HL_HEXAGON_TOOLS}/bin/hexagon-elfcopy" \
  "$@"
#!/bin/bash --norc
# Bazel tool wrapper: forwards to the Hexagon llvm-objdump.
# Fix: quote the expansion so a HL_HEXAGON_TOOLS containing spaces works.
exec -a hexagon-qualcomm-objdump \
  "${HL_HEXAGON_TOOLS}/bin/hexagon-llvm-objdump" \
  "$@"
#!/bin/bash --norc
# Bazel tool wrapper: forwards to the Hexagon strip.
# Fix: quote the expansion so a HL_HEXAGON_TOOLS containing spaces works.
exec -a hexagon-qualcomm-strip \
  "${HL_HEXAGON_TOOLS}/bin/hexagon-strip" \
  "$@"
...@@ -20,10 +20,9 @@ from __future__ import division ...@@ -20,10 +20,9 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import argparse import argparse
import copy
import sys import sys
import numpy as np from micro_converter import MicroConverter
import shutil
import tempfile
from utils import config_parser from utils import config_parser
from utils.config_parser import DataFormat from utils.config_parser import DataFormat
from utils.config_parser import DeviceType from utils.config_parser import DeviceType
...@@ -32,7 +31,7 @@ from utils import util ...@@ -32,7 +31,7 @@ from utils import util
from utils.util import mace_check from utils.util import mace_check
from utils.config_parser import normalize_model_config from utils.config_parser import normalize_model_config
from utils.config_parser import ModelKeys from utils.config_parser import ModelKeys
from py_proto import mace_pb2 from utils.convert_util import merge_params
from transform import base_converter as cvt from transform import base_converter as cvt
from transform import transformer from transform import transformer
from visualize import visualize_model from visualize import visualize_model
...@@ -45,7 +44,7 @@ def transpose_shape(shape, dst_order): ...@@ -45,7 +44,7 @@ def transpose_shape(shape, dst_order):
return t_shape return t_shape
def convert(conf, output): def convert(conf, output, enable_micro=False):
if ModelKeys.quantize_stat in conf: if ModelKeys.quantize_stat in conf:
quantize_stat = conf[ModelKeys.quantize_stat] quantize_stat = conf[ModelKeys.quantize_stat]
else: else:
...@@ -88,7 +87,12 @@ def convert(conf, output): ...@@ -88,7 +87,12 @@ def convert(conf, output):
model, params = merge_params(mace_model, model, params = merge_params(mace_model,
model_conf[ModelKeys.data_type]) model_conf[ModelKeys.data_type])
if enable_micro:
micro_converter = MicroConverter(model_conf, copy.deepcopy(model),
copy.deepcopy(params), model_name)
micro_converter.gen_code()
micro_converter.package(model_output + "/" +
model_name + "_micro.tar.gz")
output_model_file = model_output + "/" + model_name + ".pb" output_model_file = model_output + "/" + model_name + ".pb"
output_params_file = model_output + "/" + model_name + ".data" output_params_file = model_output + "/" + model_name + ".data"
with open(output_model_file, "wb") as f: with open(output_model_file, "wb") as f:
...@@ -206,61 +210,6 @@ def convert_model(conf, quantize_stat): ...@@ -206,61 +210,6 @@ def convert_model(conf, quantize_stat):
return output_graph_def return output_graph_def
def merge_params(net_def, data_type):
def tensor_to_bytes(tensor):
if tensor.data_type == mace_pb2.DT_HALF:
data = bytearray(
np.array(tensor.float_data).astype(np.float16).tobytes())
tensor.data_size = len(tensor.float_data)
elif tensor.data_type == mace_pb2.DT_FLOAT:
data = bytearray(
np.array(tensor.float_data).astype(np.float32).tobytes())
tensor.data_size = len(tensor.float_data)
elif tensor.data_type == mace_pb2.DT_INT32:
data = bytearray(
np.array(tensor.int32_data).astype(np.int32).tobytes())
tensor.data_size = len(tensor.int32_data)
elif tensor.data_type == mace_pb2.DT_UINT8:
data = bytearray(
np.array(tensor.int32_data).astype(np.uint8).tolist())
tensor.data_size = len(tensor.int32_data)
elif tensor.data_type == mace_pb2.DT_FLOAT16:
data = bytearray(
np.array(tensor.float_data).astype(np.float16).tobytes())
tensor.data_size = len(tensor.float_data)
else:
raise Exception('Tensor data type %s not supported' %
tensor.data_type)
return data
model_data = []
offset = 0
for tensor in net_def.tensors:
if tensor.data_type == mace_pb2.DT_FLOAT:
tensor.data_type = data_type
raw_data = tensor_to_bytes(tensor)
if tensor.data_type != mace_pb2.DT_UINT8 and offset % 4 != 0:
padding = 4 - offset % 4
model_data.extend(bytearray([0] * padding))
offset += padding
tensor.offset = offset
model_data.extend(raw_data)
offset += len(raw_data)
for tensor in net_def.tensors:
if tensor.data_type == mace_pb2.DT_FLOAT \
or tensor.data_type == mace_pb2.DT_HALF \
or tensor.data_type == mace_pb2.DT_FLOAT16:
del tensor.float_data[:]
elif tensor.data_type == mace_pb2.DT_INT32:
del tensor.int32_data[:]
elif tensor.data_type == mace_pb2.DT_UINT8:
del tensor.int32_data[:]
return net_def, model_data
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
......
# Copyright 2018 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import copy
import os
import sh
import yaml
from py_proto import mace_pb2
from transform.base_converter import ConverterUtil
from transform.base_converter import MaceKeyword
from transform.base_converter import MaceOp
from transform.hexagon_converter import HexagonOp
from utils.util import mace_check
def normalize_op_name(name):
    """Make an op name filesystem/identifier safe: '/' and ':' become '_'."""
    for forbidden in ('/', ':'):
        name = name.replace(forbidden, '_')
    return name
def handle_index(start, end, layers):
num_layers = end - start + 1
if ':' in layers:
start_index, end_index = layers.split(':')
start_index = int(start_index) if start_index else 0
end_index = int(end_index) if end_index else num_layers - 1
else:
start_index = int(layers)
end_index = start_index + 1
if start_index < 0:
start_index += num_layers
if end_index < 0:
end_index += num_layers
start_index += start
end_index += start
start_index = max(start, min(end - 1, start_index))
end_index = max(start + 1, min(end, end_index))
return start_index, end_index
def save_model_to_proto(net_def, model_name, output_dir):
    """Write net_def to <output_dir>/<model_name>.pb plus a human-readable
    "<name>.pb_txt" dump; returns the binary .pb path."""
    output_path = "%s/%s.pb" % (output_dir, model_name)
    with open(output_path, "wb") as proto_file:
        proto_file.write(net_def.SerializeToString())
    with open(output_path + "_txt", "w") as text_file:
        text_file.write(str(net_def))
    return output_path
def convert(model_file, output_dir, layers):
    """Split a MACE .pb model into one sub-model per layer for validation.

    For each op in the selected `layers` range, writes a truncated copy of
    the graph ending at that op (with output_info rewritten to the op's
    outputs) into `output_dir`, and records all sub-model paths/outputs in
    `<output_dir>outputs.yml`.
    """
    mace_check(os.path.isfile(model_file),
               "Input graph file '" + model_file + "' does not exist!")
    mace_check(os.path.isdir(output_dir),
               "Output directory '" + output_dir + "' does not exist!")
    net_def = mace_pb2.NetDef()
    with open(model_file, "rb") as f:
        net_def.ParseFromString(f.read())

    # Quantized graphs are detected from the model's quantize flag arg.
    is_quantize = ConverterUtil.get_arg(
        net_def, MaceKeyword.mace_quantize_flag_arg_str)
    is_quantize = False if is_quantize is None else is_quantize.i == 1
    is_hexagon = False
    index = 0
    end_index = len(net_def.op)
    if is_quantize:
        # Trim leading quantize/input ops and trailing dequantize/output ops
        # so `layers` indexes only the real compute ops.
        while index < end_index:
            # omit op quantize
            if net_def.op[index].type == MaceOp.Quantize.name or \
                    net_def.op[index].type == \
                    HexagonOp.QuantizeINPUT_f_to_8.name or \
                    net_def.op[index].type == HexagonOp.INPUT.name:
                index += 1
            # omit op dequantize
            elif net_def.op[end_index - 1].type == MaceOp.Dequantize.name or \
                    net_def.op[end_index - 1].type == \
                    HexagonOp.DequantizeOUTPUT_8tof.name or \
                    net_def.op[end_index - 1].type == HexagonOp.OUTPUT.name:
                end_index -= 1
            else:
                break
        mace_check(0 < index < end_index < len(net_def.op),
                   "Wrong number of op quantize(%d) or dequantize(%d)." %
                   (index, len(net_def.op) - end_index))
        # A Hexagon graph always ends with a dequantize/OUTPUT op.
        if net_def.op[-1].type == HexagonOp.DequantizeOUTPUT_8tof.name or \
                net_def.op[-1].type == HexagonOp.OUTPUT.name:
            is_hexagon = True

    index, end_index = handle_index(index, end_index, layers)

    data_format = net_def.output_info[0].data_format
    output_configs = {"subgraphs": []}
    while index < end_index:
        # omit BatchToSpaceND and op before that due to changed graph
        if net_def.op[index].type == MaceOp.BatchToSpaceND.name or \
                net_def.op[index].type == HexagonOp.BatchToSpaceND_8.name or \
                (index + 1 < end_index and
                 (net_def.op[index + 1].type == MaceOp.BatchToSpaceND.name or
                  net_def.op[index + 1].type == HexagonOp.BatchToSpaceND_8.name)):  # noqa
            index += 1
            continue
        # Work on a deep copy: each sub-model truncates the graph after the
        # current op.
        net = copy.deepcopy(net_def)
        if is_hexagon:
            # reuse dequantize op and it's min/max tensor's node_id
            del net.op[index+1:-1]
        else:
            del net.op[index+1:]
        del net.output_info[:]
        op = net.op[index]
        index += 1

        output_tensors = []
        output_shapes = []
        op_name = op.name
        if is_quantize:
            op.name = MaceKeyword.mace_output_node_name + '_' + op.name
        if is_hexagon:
            mace_check(len(op.output) == 1,
                       "Only supports number of outputs of Hexagon op be 1.")
        for i in range(len(op.output)):
            output_tensors.append(str(op.output[i]))
            output_shapes.append(
                ",".join([str(dim) for dim in op.output_shape[i].dims]))
            # modify output info
            output_info = net.output_info.add()
            output_info.name = op.output[i]
            output_info.data_format = data_format
            output_info.dims.extend(op.output_shape[i].dims)
            output_info.data_type = mace_pb2.DT_FLOAT
            if is_quantize:
                output_info.scale = op.quantize_info[0].scale
                output_info.zero_point = op.quantize_info[0].zero_point
            # modify output op
            if is_quantize:
                # Rename the op's output and append/repurpose a dequantize op
                # so the sub-model emits float values for validation.
                output_name = op.output[i]
                new_output_name = \
                    MaceKeyword.mace_output_node_name + '_' + op.output[i]
                op.output[i] = new_output_name
                if not is_hexagon:
                    dequantize_op = net.op.add()
                    dequantize_op.name = normalize_op_name(output_name)
                    dequantize_op.type = MaceOp.Dequantize.name
                    dequantize_op.input.append(new_output_name)
                    dequantize_op.output.append(output_name)
                    output_shape = dequantize_op.output_shape.add()
                    output_shape.dims.extend(op.output_shape[i].dims)
                    dequantize_op.output_type.append(mace_pb2.DT_FLOAT)
                    ConverterUtil.add_data_type_arg(dequantize_op,
                                                    mace_pb2.DT_UINT8)
                else:
                    # Hexagon: rewire the preserved trailing dequantize op to
                    # consume the current op's output (and its min/max pair).
                    dequantize_op = net.op[-1]
                    dequantize_op.name = normalize_op_name(output_name)
                    del dequantize_op.input[:]
                    del dequantize_op.output[:]
                    dequantize_op.input.append(new_output_name)
                    dequantize_op.node_input[0].node_id = op.node_id
                    dequantize_op.output.append(output_name)
                    if dequantize_op.type == \
                            HexagonOp.DequantizeOUTPUT_8tof.name:
                        input_min = new_output_name[:-1] + '1'
                        input_max = new_output_name[:-1] + '2'
                        dequantize_op.input.extend([input_min, input_max])
                        dequantize_op.node_input[1].node_id = op.node_id
                        dequantize_op.node_input[2].node_id = op.node_id
                        del dequantize_op.node_input[3:]
                    else:
                        del dequantize_op.node_input[1:]

        model_path = save_model_to_proto(net, normalize_op_name(op_name),
                                         output_dir)
        output_config = {"model_file_path": str(model_path),
                         "output_tensors": output_tensors,
                         "output_shapes": output_shapes}
        output_configs["subgraphs"].append(output_config)

    # NOTE(review): assumes output_dir ends with '/'; get_layers passes one
    # with a trailing slash — confirm for other callers.
    output_configs_path = output_dir + "outputs.yml"
    with open(output_configs_path, "w") as f:
        yaml.dump(output_configs, f, default_flow_style=False)
def get_layers(model_dir, model_name, layers):
    """Split <model_dir>/<model_name>.pb into per-layer sub-models.

    Regenerates <model_dir>/output_models/ from scratch, runs convert(),
    then returns the list of subgraph configs parsed from outputs.yml.
    """
    # Local import keeps the module's import surface unchanged; shutil is
    # the stdlib replacement for the previous third-party `sh.rm` call.
    import shutil

    model_file = "%s/%s.pb" % (model_dir, model_name)
    output_dir = "%s/output_models/" % model_dir
    # Start from a clean output directory.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    convert(model_file, output_dir, layers)
    output_configs_path = output_dir + "outputs.yml"
    with open(output_configs_path) as f:
        # safe_load: avoids executing arbitrary YAML tags and the PyYAML
        # "load() without Loader" deprecation warning; the file contains only
        # plain mappings/lists, so behavior is unchanged.
        output_configs = yaml.safe_load(f)
    output_configs = output_configs['subgraphs']
    return output_configs
def parse_args():
    """Parses command line arguments."""
    parser = argparse.ArgumentParser()
    flag_specs = (
        ("--model_file", "", "pb file to load."),
        ("--output_dir", "", "Directory to save the output graph to."),
        ("--layers", "-1",
         "'start_layer:end_layer' or 'layer', similar to python slice."
         " Use with --validate flag."),
    )
    for flag, default, help_text in flag_specs:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    return parser.parse_known_args()
if __name__ == '__main__':
    # Script entry point: split the given model into per-layer sub-models.
    FLAGS, _ = parse_args()
    convert(FLAGS.model_file, FLAGS.output_dir, FLAGS.layers)
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from py_proto import micro_mem_pb2
from utils.util import mace_check
class GraphBuilder:
    """Builds a micro_mem_pb2.Graph from a NetDef for MACE Micro codegen.

    Tensor references are packed into 32-bit ints:
      * op output:     (op_index << 16) | output_index
      * const tensor:  0xffff0000 | tensor_index
      * model input:   0xfffe0000 | input_index
    """

    def __init__(self, pb_model, op_resolver):
        self.net_def = pb_model
        # Maps op type -> list of op descriptions known to the resolver.
        self.ops_desc_map = op_resolver.get_op_desc_map_from_model()
        self.op_resolver = op_resolver
        self.init_output_cache()
        self.init_const_tensor_cache()
        self.init_model_input_cache()

    def get_op_idx(self, op_def):
        """Returns the resolver index matching op_def, or -1 if unknown."""
        if op_def.type not in self.ops_desc_map:
            return -1
        op_desc_list = self.ops_desc_map[op_def.type]
        for op_desc in op_desc_list:
            if self.op_resolver.op_def_desc_matched(op_def, op_desc):
                return op_desc.idx
        return -1

    def init_output_cache(self):
        """Indexes every op output; model outputs go to self.output_infos,
        intermediate outputs to self.output_cache keyed by tensor name."""
        model_outputs = []
        for output_info in self.net_def.output_info:
            model_outputs.append(output_info.name)
        self.output_cache = {}
        self.output_infos = []
        for i in range(len(self.net_def.op)):
            op_def = self.net_def.op[i]
            for k in range(len(op_def.output)):
                tensor_name = op_def.output[k]
                # High 16 bits: producing op's index; low 16 bits: output slot.
                output_info_uint = ((i & 0x0000ffff) << 16) | (k & 0x0000ffff)
                if tensor_name in model_outputs:
                    self.output_infos.append(output_info_uint)
                else:
                    self.output_cache[tensor_name] = output_info_uint

    def init_const_tensor_cache(self):
        """Indexes constant tensors with the 0xffff marker in the high half."""
        self.const_tensor_cache = {}
        for i in range(len(self.net_def.tensors)):
            const_tensor = self.net_def.tensors[i]
            self.const_tensor_cache[const_tensor.name] = \
                (0xffff0000 | (i & 0x0000ffff))

    def init_model_input_cache(self):
        """Indexes model inputs with the 0xfffe marker in the high half."""
        self.model_input_cache = {}
        for i in range(len(self.net_def.input_info)):
            input_info = self.net_def.input_info[i]
            self.model_input_cache[input_info.name] = \
                (0xfffe0000 | (i & 0x0000ffff))

    def build(self):
        """Assembles and returns the micro_mem_pb2.Graph for the model."""
        graph = micro_mem_pb2.Graph()
        graph.output_infos.extend(self.output_infos)
        for op_def in self.net_def.op:
            op_context = graph.op_contexts.add()
            idx = self.get_op_idx(op_def)
            mace_check(idx >= 0, "Error from the OpResolver.")
            op_context.op_idx = idx
            op_with_model_input = False
            for input in op_def.input:
                input_info = 0
                if input in self.output_cache:
                    input_info = self.output_cache[input]
                elif input in self.const_tensor_cache:
                    input_info = self.const_tensor_cache[input]
                elif input in self.model_input_cache:
                    input_info = self.model_input_cache[input]
                    op_with_model_input = True
                else:
                    mace_check(False,
                               "Model error: can not find input(%s)" % input)
                op_context.input_infos.append(input_info)
            if op_with_model_input:
                # NOTE(review): this appends the resolver's op-description
                # index, not this op's position in net_def.op — confirm the
                # runtime expects that.
                graph.input_op_idxs.append(idx)
            for output_shape in op_def.output_shape:
                resize_shape = op_context.output_resize_shapes.add()
                for dim in output_shape.dims:
                    resize_shape.dims.append(dim)
        return graph
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
// C ABI wrappers (Jinja template) around the C++ micro engine singleton,
// for callers that cannot link against C++ directly.
#include "micro/codegen/engines/{{model_tag}}/micro_engine_c_interface.h"
#include "micro/codegen/engines/{{model_tag}}/micro_engine_factory.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef micro::MaceMicroEngine MaceMicroEngine;
typedef micro::MaceStatus MaceStatus;

// Returns the singleton engine as an opaque handle.  May return NULL if the
// factory fails before assigning the pointer.
void *{{model_tag}}_GetMaceMicroEngineHandle() {
  MaceMicroEngine *micro_engine = NULL;
  micro::{{model_tag}}::GetMicroEngineSingleton(&micro_engine);
  return micro_engine;
}

// Registers the idx-th input buffer and its dims; true on MACE_SUCCESS.
bool {{model_tag}}_RegisterInputData(void *handle, uint32_t idx,
                                     const void *input_buffer,
                                     const int32_t *input_dims) {
  MaceMicroEngine *micro_engine = static_cast<MaceMicroEngine *>(handle);
  MaceStatus status =
      micro_engine->RegisterInputData(idx, input_buffer, input_dims);
  return (status == micro::MACE_SUCCESS);
}

// Runs one inference pass; true on MACE_SUCCESS.
bool {{model_tag}}_Interpret(void *handle) {
  MaceMicroEngine *micro_engine = static_cast<MaceMicroEngine *>(handle);
  MaceStatus status = micro_engine->Run();
  return (status == micro::MACE_SUCCESS);
}

// Fetches the idx-th output buffer, its dims and rank; true on MACE_SUCCESS.
bool {{model_tag}}_GetInterpretResult(void *handle, const uint32_t idx,
                                      void **output_data,
                                      const int32_t **output_dims,
                                      uint32_t *output_dim_size) {
  MaceMicroEngine *micro_engine = static_cast<MaceMicroEngine *>(handle);
  MaceStatus status = micro_engine->GetOutputData(
      idx, output_data, output_dims, output_dim_size);
  return (status == micro::MACE_SUCCESS);
}

#ifdef __cplusplus
}
#endif
\ No newline at end of file
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
// C ABI declarations (Jinja template) for the micro engine wrappers.
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Returns an opaque handle to the model's singleton micro engine.
void *{{model_tag}}_GetMaceMicroEngineHandle();

// Registers the idx-th input buffer and its dims; returns true on success.
bool {{model_tag}}_RegisterInputData(void *handle, uint32_t idx,
                                     const void *input_buffer,
                                     const int32_t *input_dims);

// Runs one inference pass; returns true on success.
bool {{model_tag}}_Interpret(void *handle);

// Fetches the idx-th output buffer, its dims and rank; true on success.
bool {{model_tag}}_GetInterpretResult(void *handle, const uint32_t idx,
                                      void **output_data,
                                      const int32_t **output_dims,
                                      uint32_t *output_dim_size);

#ifdef __cplusplus
}
#endif
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
// Jinja template: statically allocates the buffers and configuration the
// micro engine needs, sized from the converter's embed_data.
#include <stdint.h>

#include "micro/framework/graph.h"
#include "micro/include/public/micro.h"
#include "micro/model/net_def.h"
#include "micro/codegen/models/{{model_tag}}/micro_graph_data.h"
#include "micro/codegen/models/{{model_tag}}/micro_model_data.h"
#include "micro/codegen/models/{{model_tag}}/micro_net_def_data.h"
#include "micro/codegen/models/{{model_tag}}/micro_ops_list.h"

namespace micro {
namespace {{model_tag}} {

namespace {
// Arena for intermediate tensors.
uint8_t kTensorMem[{{ embed_data.tensor_mem_size }}] = {0};
// Scratch space for ops needing temporary memory.
uint8_t kScratchBuffer[{{ embed_data.scratch_buffer_size }}] = {0};
// Caller-registered input buffers/shapes, one slot per model input.
const void *kInputBuffers[{{ embed_data.input_size }}] = {NULL};
const int32_t *kInputShapes[{{ embed_data.input_size }}] = {NULL};

// Bundles the embedded model/graph data and the static buffers above.
MaceMicroEngineConfig kMicroEngineConfig = {
    reinterpret_cast<model::NetDef *>(kNetDef),
    kModelData,
    reinterpret_cast<framework::Graph *>(kGraphData),
    kOpsArray,
    kTensorMem,
    kInputBuffers,
    kInputShapes,
    kScratchBuffer,
    {{ embed_data.scratch_buffer_size }}
};
}  // namespace

MaceMicroEngineConfig *GetMicroEngineConfig() {
  return &kMicroEngineConfig;
}

}  // namespace {{model_tag}}
}  // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
// Jinja template: lazily-initialized singleton accessor for the micro engine.
#include "micro/codegen/engines/{{model_tag}}/micro_engine_factory.h"

namespace micro {
namespace {{model_tag}} {

namespace {
// Statically allocated engine shared by all callers.
MaceMicroEngine kMaceMicroEngine;
// Whether kMaceMicroEngine has been successfully initialized.
bool kHasInit = false;
}  // namespace

extern MaceMicroEngineConfig *GetMicroEngineConfig();

// Returns the singleton engine through `engine` after a one-time Init().
// NOTE(review): not guarded against concurrent first calls — confirm the
// micro runtime is single-threaded.
MaceStatus GetMicroEngineSingleton(MaceMicroEngine **engine) {
  MaceStatus status = MACE_SUCCESS;
  if (!kHasInit) {
    MaceMicroEngineConfig *engine_config = GetMicroEngineConfig();
    status = kMaceMicroEngine.Init(engine_config);
    if (status == MACE_SUCCESS) {
      // Bug fix: the original never set kHasInit, so Init() re-ran on
      // every call instead of only the first.
      kHasInit = true;
    }
  }
  if (status == MACE_SUCCESS) {
    *engine = &kMaceMicroEngine;
  }
  return status;
}

}  // namespace {{model_tag}}
}  // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
#include "micro/include/public/micro.h"

namespace micro {
namespace {{model_tag}} {

// Initializes (on first successful call) and returns the model's singleton
// engine through `engine`; returns the init status.
MaceStatus GetMicroEngineSingleton(MaceMicroEngine **engine);

}  // namespace {{model_tag}}
}  // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
#include <stdint.h>

namespace micro {
namespace {{model_tag}} {

// Serialized graph structure embedded as raw bytes; cast to
// framework::Graph by the engine config.
uint8_t kGraphData[{{ data_size }}] = {
{% for d in embed_data %}{{"0x%02X, " % d }}{%endfor%}
};

}  // namespace {{model_tag}}
}  // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
#include <stdint.h>

namespace micro {
namespace {{model_tag}} {

// Model weight/parameter bytes embedded into the binary.
const uint8_t kModelData[{{ data_size }}] = {
{% for d in embed_data %}{{"0x%02X, " % d }}{%endfor%}
};

}  // namespace {{model_tag}}
}  // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
#include <stdint.h>

namespace micro {
namespace {{model_tag}} {

// Serialized net definition embedded as raw bytes; cast to model::NetDef
// by the engine config.
uint8_t kNetDef[{{ data_size }}] = {
{% for d in embed_data %}{{"0x%02X, " % d }}{%endfor%}
};

}  // namespace {{model_tag}}
}  // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This is a generated file. DO NOT EDIT!
{% for op_src_path in op_src_path_list %}
{{ "#include \"%s\"" % op_src_path }}
{%endfor%}

namespace micro {
namespace {{model_tag}} {

namespace {
// One statically allocated operator instance per distinct op in the graph.
{% for i in range(0, op_class_name_list_size) %}
{{ "ops::%s op%s;" % (op_class_name_list[i], i) }}
{%endfor%}
}  // namespace

// NOTE(review): the array length uses {{ data_size }} while the entries come
// from op_class_name_list_size — presumably the template is always rendered
// with equal values; confirm in the converter.
framework::Operator *kOpsArray[{{ data_size }}] = {
{% for i in range(0, op_class_name_list_size) %}
{{ "&op%s," % i }}
{%endfor%}
};

}  // namespace {{model_tag}}
}  // namespace micro
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from utils.convert_util import data_type_to_np_dt
from utils.util import mace_check
import numpy as np
class MemBlock:
    """One reusable chunk of the tensor arena: bytes [offset, offset + size)."""

    def __init__(self, tensor_name, offset, size):
        self.offset = offset  # byte offset of the block within the arena
        self.size = size  # capacity of the block in bytes
        self.tensor_name = tensor_name  # tensor currently occupying the block
class MemComputer:
    """Plans the intermediate-tensor arena for a MACE Micro model.

    Simulates executing ``net_def.op`` in order: each op output gets a block
    (reusing a freed block first-fit when possible — the free list is kept
    sorted by size, so the smallest sufficient block wins), and blocks are
    freed once their last consumer has run.  The chosen byte offset of every
    output is appended to ``op.mem_id``; ``compute()`` returns the total
    arena size in bytes.
    """

    def __init__(self, net_def, np_data_type):
        # net_def: model graph; np_data_type: default numpy dtype used to
        # size outputs whose op declares no explicit output_type.
        self.net_def = net_def
        self.np_data_type = np_data_type
        # Constant weights live in the model data section, not the arena.
        self.const_tensor_names = []
        for const_tensor in net_def.tensors:
            self.const_tensor_names.append(const_tensor.name)
        # Model inputs are supplied by the caller, not the arena.
        self.input_names = []
        for input_info in net_def.input_info:
            self.input_names.append(input_info.name)

    def init_computer(self):
        """Resets planner state and counts each tensor's consumers."""
        self.free_mem_list = []  # released blocks, kept sorted by size
        self.used_mem_list = []  # blocks currently holding a live tensor
        self.buffer_size = 0     # total arena bytes allocated so far
        self.ref_counts = {}
        for op in self.net_def.op:
            for tensor_name in op.input:
                if tensor_name in self.const_tensor_names or \
                        tensor_name in self.input_names:
                    continue
                if tensor_name not in self.ref_counts:
                    self.ref_counts[tensor_name] = 0
                self.ref_counts[tensor_name] += 1

    def get_mem_size(self, op, output_shape):
        """Returns the padded byte size needed for one output of `op`.

        The innermost dimension is rounded up to a multiple of 4 elements;
        WinogradTransform/GEMM round up the second dimension instead.
        """
        np_data_type = self.np_data_type
        if len(op.output_type) > 0:
            np_data_type = \
                data_type_to_np_dt(op.output_type[0], self.np_data_type)
        data_type_bytes = np.dtype(np_data_type).itemsize
        if op.type == 'WinogradTransform' or op.type == 'GEMM':
            mace_check(len(output_shape) == 4,
                       "WinogradTransform and GEMM only support 4-dim")
            mem_size = output_shape[2] * output_shape[3] * output_shape[0] \
                * int((output_shape[1] + 3) / 4) * 4
        else:
            dim_size = len(output_shape)
            if dim_size > 0:
                mem_size = int((output_shape[dim_size - 1] + 3) / 4) * 4
                for i in range(dim_size - 1):
                    mem_size *= output_shape[i]
            else:
                print("the op %s's output dim size is 0" % op.type)
                mem_size = 0
        return mem_size * data_type_bytes

    def remove_mem_block_by_name(self, mem_list, tensor_name):
        """Removes and returns the block assigned to tensor_name, or None."""
        return_mem_block = None
        for mem_block in mem_list:
            if tensor_name == mem_block.tensor_name:
                return_mem_block = mem_block
                mem_list.remove(mem_block)
                break
        return return_mem_block

    def fake_new(self, op):
        """Allocates (or reuses) one block for every output of `op`,
        recording each block's arena offset in op.mem_id."""
        output_size = len(op.output)
        for i in range(output_size):
            mem_size = self.get_mem_size(op, op.output_shape[i].dims)
            final_mem_block = None
            for free_block in self.free_mem_list:
                if free_block.size >= mem_size:
                    # First fit: the free list is sorted by size, so this is
                    # the smallest block that can hold the tensor.
                    free_block.tensor_name = op.output[i]
                    final_mem_block = free_block
                    self.free_mem_list.remove(free_block)
                    break
            if final_mem_block is None:
                final_mem_block = MemBlock(op.output[i], self.buffer_size,
                                           mem_size)
                self.buffer_size += mem_size
            # For micro, mem_id holds the byte offset into the arena.
            op.mem_id.append(final_mem_block.offset)
            self.used_mem_list.append(final_mem_block)

    def fake_delete(self, op):
        """Releases blocks whose tensor has no remaining consumers."""
        for tensor_name in op.input:
            if tensor_name in self.const_tensor_names or \
                    tensor_name in self.input_names:
                continue
            mace_check(tensor_name in self.ref_counts and
                       self.ref_counts[tensor_name] > 0,
                       "Invalid: ref_count is 0.")
            self.ref_counts[tensor_name] -= 1
            # Bug fix: compare with == instead of `is` — identity comparison
            # against an int literal relies on CPython's small-int cache and
            # is a SyntaxWarning on Python >= 3.8.
            if self.ref_counts[tensor_name] == 0:
                mem_block = self.remove_mem_block_by_name(
                    self.used_mem_list, tensor_name)
                mace_check(mem_block is not None,
                           "error, can not find tensor: %s" % tensor_name)
                self.free_mem_list.append(mem_block)
                # Keep the free list sorted so fake_new's first fit picks the
                # smallest sufficient block.
                self.free_mem_list.sort(key=lambda block: block.size)

    def fake_execute_op(self, op):
        """Simulates running `op`: allocate its outputs, then release any
        inputs for which it was the last consumer."""
        # Bug fix: fake_new already iterates over every output of `op`; the
        # old code called it once per output, allocating each output
        # len(op.output) times and appending duplicate mem_ids for
        # multi-output ops.
        self.fake_new(op)
        self.fake_delete(op)

    # return the tensor memory size needed by mace micro
    def compute(self):
        """Runs the simulation over all ops; returns total arena bytes."""
        self.init_computer()
        for op in self.net_def.op:
            self.fake_execute_op(op)
        return self.buffer_size
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
from jinja2 import Environment, FileSystemLoader
# Directory (relative to this script's folder) holding the jinja2 templates.
JINJA2_DIR = './jinja2_files/'
class MicroCodeGen:
    """Generates C++ sources for a MACE micro model by rendering jinja2
    templates with model-specific data."""

    def __init__(self):
        pass

    def _render_template_to_file(self, jinja_file_name, output_path,
                                 **context):
        """Render ``JINJA2_DIR + jinja_file_name`` (looked up relative to
        this script's directory) with ``context`` and write the result to
        ``output_path``.  Shared helper: this environment/render/write
        sequence was previously duplicated in every ``gen_*`` method."""
        cwd = os.path.dirname(__file__)
        j2_env = Environment(
            loader=FileSystemLoader(cwd), trim_blocks=True)
        template_name = JINJA2_DIR + jinja_file_name
        source = j2_env.get_template(template_name).render(**context)
        with open(output_path, "w") as f:
            f.write(source)

    def gen_micro_ops_list_from_bytes(self, model_tag, op_src_path_list,
                                      op_class_name_list,
                                      jinja_file_name, output_path):
        """Generate the op-registry source listing every op class used."""
        self._render_template_to_file(
            jinja_file_name, output_path,
            model_tag=model_tag,
            op_src_path_list=op_src_path_list,
            op_class_name_list=op_class_name_list,
            op_class_name_list_size=len(op_class_name_list))

    def gen_micro_source_from_bytes(self, model_tag, embed_data,
                                    jinja_file_name, output_path):
        """Generate a source file embedding ``embed_data`` as a byte
        array of length ``len(embed_data)``."""
        self._render_template_to_file(
            jinja_file_name, output_path,
            model_tag=model_tag,
            embed_data=embed_data,
            data_size=len(embed_data))

    def gen_net_def_data(self, model_tag, model_def_data, output_path):
        """Embed the serialized NetDef bytes."""
        embed_data = np.frombuffer(model_def_data, dtype=np.uint8)
        self.gen_micro_source_from_bytes(
            model_tag, embed_data, 'micro_net_def.h.jinja2', output_path)

    def gen_graph_data(self, model_tag, graph_data, output_path):
        """Embed the serialized graph bytes."""
        embed_data = np.frombuffer(graph_data, dtype=np.uint8)
        self.gen_micro_source_from_bytes(model_tag, embed_data,
                                         'micro_graph_data.h.jinja2',
                                         output_path)

    def gen_ops_data(self, model_tag, op_src_path_list,
                     op_class_name_list, output_path):
        """Generate the ops-list header for the model."""
        self.gen_micro_ops_list_from_bytes(model_tag, op_src_path_list,
                                           op_class_name_list,
                                           'micro_ops_list.h.jinja2',
                                           output_path)

    def gen_engin_config(self, model_tag, config_data, output_path):
        """Generate the engine-config source.

        NOTE: method name keeps its historical spelling ("engin") so
        existing callers are unaffected.
        """
        self.gen_micro_source_from_bytes(model_tag, config_data,
                                         'micro_engine_config.cc.jinja2',
                                         output_path)

    def gen_model_data(self, model_tag, model_param_data, output_path):
        """Embed the model weight bytes."""
        embed_data = np.frombuffer(model_param_data, dtype=np.uint8)
        self.gen_micro_source_from_bytes(model_tag, embed_data,
                                         'micro_model_data.h.jinja2',
                                         output_path)

    def gen_engine_factory(self, model_tag, output_path_h, output_path_cc):
        """Generate the engine-factory header/source pair."""
        self.gen_micro_source_from_bytes(model_tag, '',
                                         'micro_engine_factory.h.jinja2',
                                         output_path_h)
        self.gen_micro_source_from_bytes(model_tag, '',
                                         'micro_engine_factory.cc.jinja2',
                                         output_path_cc)

    def gen_engine_c_interface(self, model_tag, output_path_h, output_path_cc):
        """Generate the C-interface header/source pair."""
        self.gen_micro_source_from_bytes(model_tag, '',
                                         'micro_engine_c_interface.h.jinja2',
                                         output_path_h)
        self.gen_micro_source_from_bytes(model_tag, '',
                                         'micro_engine_c_interface.cc.jinja2',
                                         output_path_cc)
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from py_proto import mace_pb2
from transform.base_converter import MaceOp
from utils.util import mace_check
import copy
class MicroIoConverter:
    """Inserts data-type Cast ops around model inputs/outputs so callers
    can feed/read float data while the net computes in bfloat16."""

    @staticmethod
    def add_dt_cast_for_bf16(net_def):
        """Return a copy of ``net_def`` with a float->bf16 Cast inserted
        after each model input and a bf16->float Cast appended before each
        model output.  ``net_def`` itself is not modified."""
        # Start from a deep copy whose op list is emptied; ops are re-added
        # one by one (possibly wrapped with Cast ops) below.
        bf16_net_def = copy.deepcopy(net_def)
        op_num = len(bf16_net_def.op)
        for i in range(op_num):
            bf16_net_def.op.pop()
        # Map model input/output names to their shapes for quick lookup;
        # entries are popped once handled, so each I/O gets one cast.
        model_input = {}
        for input_info in net_def.input_info:
            model_input[input_info.name] = input_info.dims
        model_output = {}
        for output_info in net_def.output_info:
            model_output[output_info.name] = output_info.dims
        for op_def in net_def.op:
            # True once a (possibly rewired) copy of op_def has been
            # appended to bf16_net_def.op during this iteration.
            op_added = False
            if len(model_input) > 0:
                for i in range(len(op_def.input)):
                    input_name = op_def.input[i]
                    if input_name in model_input:
                        # Take the already-appended copy back off the list
                        # so the cast op can be inserted before it.
                        if op_added:
                            next_op = bf16_net_def.op.pop()
                        else:
                            next_op = copy.deepcopy(op_def)
                            op_added = True
                        # Build Cast(float -> bf16) fed by the model input.
                        op_cast = bf16_net_def.op.add()
                        op_cast.name = MaceOp.Cast.name + "_op_" + input_name
                        op_cast.type = MaceOp.Cast.name
                        op_cast.input.append(input_name)
                        trans_output_name = \
                            MaceOp.Cast.name + "_out_" + input_name
                        op_cast.output.append(trans_output_name)
                        data_type_arg = op_cast.arg.add()
                        data_type_arg.name = 'T'
                        data_type_arg.i = mace_pb2.DT_FLOAT
                        op_cast.output_type.append(mace_pb2.DT_BFLOAT16)
                        output_shape = op_cast.output_shape.add()
                        output_shape.dims.extend(model_input[input_name])
                        # Rewire the consumer to read the cast's output.
                        next_op.input[i] = trans_output_name
                        bf16_net_def.op.append(next_op)
                        model_input.pop(input_name)
            if len(model_output) > 0:
                mace_check(len(op_def.output) == 1,
                           "Not support output num > 1")
                output_name = op_def.output[0]
                if output_name in model_output:
                    if not op_added:
                        last_op = copy.deepcopy(op_def)
                        op_added = True
                    else:
                        last_op = bf16_net_def.op.pop()
                    # Redirect the producer to an intermediate name, then
                    # cast bf16 -> float into the original output name.
                    last_op.output[0] = output_name + "_" + MaceOp.Cast.name
                    bf16_net_def.op.append(last_op)
                    op_cast = bf16_net_def.op.add()
                    op_cast.name = MaceOp.Cast.name + "_op_" + output_name
                    op_cast.type = MaceOp.Cast.name
                    op_cast.input.append(last_op.output[0])
                    op_cast.output.append(output_name)
                    data_type_arg = op_cast.arg.add()
                    data_type_arg.name = 'T'
                    data_type_arg.i = mace_pb2.DT_BFLOAT16
                    op_cast.output_type.append(mace_pb2.DT_FLOAT)
                    output_shape = op_cast.output_shape.add()
                    output_shape.dims.extend(model_output[output_name])
                    model_output.pop(output_name)
            # Op touches neither a model input nor a model output: copy it
            # through unchanged.
            if not op_added:
                bf16_net_def.op.append(copy.deepcopy(op_def))
        return bf16_net_def

    @staticmethod
    def convert(net_def, data_type):
        """Entry point: return a bf16-adapted copy for DT_BFLOAT16 models,
        otherwise return ``net_def`` unchanged."""
        if data_type == mace_pb2.DT_BFLOAT16:
            print("data type is bfloat16, add input/output layers")
            return MicroIoConverter.add_dt_cast_for_bf16(net_def)
        else:
            print("data type is %s" % data_type)
            return net_def
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transform.base_converter import ConverterUtil
from transform.base_converter import DataFormat
from transform.base_converter import MaceKeyword
from transform.base_converter import MaceOp
from utils.util import mace_check
import numpy as np
class MicroOpConverter:
    """Rewrites weight data in-place so it matches the layouts expected by
    the micro runtime."""

    def __init__(self, pb_model, model_weights, data_type=np.float32):
        self.net_def = pb_model
        self.model_weights = model_weights
        self.weight_bytes = bytearray(model_weights)
        self.data_type = data_type
        # Name -> const tensor lookup for the filter transforms below.
        self._consts = {tensor.name: tensor
                        for tensor in self.net_def.tensors}

    def convert_filters_format(self):
        """Transpose every (depthwise) conv filter from OIHW to OHWI,
        updating both the weight bytes and the tensor dims in place."""
        arg_format = ConverterUtil.get_arg(self.net_def,
                                           MaceKeyword.mace_filter_format_str)
        mace_check(arg_format.i == DataFormat.OIHW.value, "Invalid model")
        arg_format.i = DataFormat.OHWI.value
        handled = set()
        for op in self.net_def.op:
            # OIHW => OHWI
            is_conv = op.type in (MaceOp.Conv2D.name,
                                  MaceOp.DepthwiseConv2d.name)
            # Shared filters are transposed only once.
            if not is_conv or op.input[1] in handled:
                continue
            print("transform filter: %s" % op.type)
            filter_tensor = self._consts[op.input[1]]
            raw = np.frombuffer(self.weight_bytes, self.data_type,
                                filter_tensor.data_size,
                                filter_tensor.offset)
            transposed = np.array(raw).reshape(filter_tensor.dims) \
                .transpose(0, 2, 3, 1)
            new_bytes = np.array(transposed).tobytes()
            slice_end = filter_tensor.offset + len(new_bytes)
            self.model_weights[filter_tensor.offset: slice_end] = new_bytes
            filter_tensor.dims[:] = transposed.shape
            handled.add(op.input[1])

    def convert_op_params(self):
        self.convert_filters_format()
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from py_proto import mace_pb2
from transform.base_converter import MaceKeyword
from transform.base_converter import MaceOp
from utils.config_parser import DataFormat
from utils.config_parser import ModelKeys
from utils.config_parser import Platform
from utils.util import mace_check
import copy
class OpDescriptor:
    """Static description of one micro op implementation: where its C++
    source lives, its class name, and which op type / data type / data
    format / specialization tag it serves."""

    def __init__(self, src_path, class_name, type,
                 data_type, data_format, tag=None):
        (self.src_path, self.class_name, self.type,
         self.data_type, self.data_format, self.tag) = \
            (src_path, class_name, type, data_type, data_format, tag)
        # Bound later, when the descriptor is matched to a concrete model op.
        self.name = None
        self.idx = -1
# Registry of every op implementation available to MACE micro.  For a given
# op, OpResolver.find_op_in_desc_map returns the first descriptor whose
# type / data type / data format / tag all match, so reference kernels are
# listed alongside their tagged specializations (e.g. 'c4s4', 'kb4s4').
McSupportedOps = [
    OpDescriptor('micro/ops/argmax.h', 'ArgMaxOp<mifloat>', MaceOp.ArgMax.name,
                 mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/nhwc/conv_2d_ref.h', 'Conv2dRefOp',
                 MaceOp.Conv2D.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC, None),
    OpDescriptor('micro/ops/nhwc/conv_2d_c4_s4.h', 'Conv2dC4S4Op',
                 MaceOp.Conv2D.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC, 'c4s4'),
    OpDescriptor('micro/ops/nhwc/conv_2d_c3_s4.h', 'Conv2dC3S4Op',
                 MaceOp.Conv2D.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC, 'c3s4'),
    OpDescriptor('micro/ops/nhwc/conv_2d_c2_s4.h', 'Conv2dC2S4Op',
                 MaceOp.Conv2D.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC, 'c2s4'),
    OpDescriptor('micro/ops/cast.h', 'CastOp',
                 MaceOp.Cast.name, mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/nhwc/pooling_ref.h', 'PoolingRefOp',
                 MaceOp.Pooling.name, mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/nhwc/pooling_s4.h', 'PoolingS4Op',
                 MaceOp.Pooling.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC, "s4"),
    OpDescriptor('micro/ops/squeeze.h', 'SqueezeOp', MaceOp.Squeeze.name,
                 mace_pb2.DT_FLOAT, None),
    OpDescriptor('micro/ops/softmax.h', 'SoftmaxOp', MaceOp.Softmax.name,
                 mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/eltwise.h', 'EltwiseOp<mifloat>',
                 MaceOp.Eltwise.name, mace_pb2.DT_FLOAT, None),
    OpDescriptor('micro/ops/eltwise.h', 'EltwiseOp<int32_t>',
                 MaceOp.Eltwise.name, mace_pb2.DT_INT32, None),
    OpDescriptor('micro/ops/activation.h', 'ActivationOp',
                 MaceOp.Activation.name, mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/strided_slice.h', 'StridedSliceOp<mifloat>',
                 MaceOp.StridedSlice.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC),
    OpDescriptor('micro/ops/strided_slice.h', 'StridedSliceOp<int32_t>',
                 MaceOp.StridedSlice.name, mace_pb2.DT_INT32,
                 DataFormat.NHWC),
    OpDescriptor('micro/ops/reduce.h', 'ReduceOp<mifloat>', MaceOp.Reduce.name,
                 mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/reduce.h', 'ReduceOp<int32_t>', MaceOp.Reduce.name,
                 mace_pb2.DT_INT32, DataFormat.NHWC),
    OpDescriptor('micro/ops/stack.h', 'StackOp<mifloat>', MaceOp.Stack.name,
                 mace_pb2.DT_FLOAT, None),
    OpDescriptor('micro/ops/stack.h', 'StackOp<int32_t>', MaceOp.Stack.name,
                 mace_pb2.DT_INT32, None),
    OpDescriptor('micro/ops/bias_add.h', 'BiasAddOp', MaceOp.BiasAdd.name,
                 mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/matmul.h', 'MatMulOp', MaceOp.MatMul.name,
                 mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/nhwc/batch_norm.h', 'BatchNormOp',
                 MaceOp.BatchNorm.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC),
    OpDescriptor('micro/ops/shape.h', 'ShapeOp', MaceOp.Shape.name,
                 mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/reshape.h', 'ReshapeOp', MaceOp.Reshape.name,
                 mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/expand_dims.h', 'ExpandDimsOp',
                 MaceOp.ExpandDims.name, mace_pb2.DT_FLOAT, DataFormat.NHWC),
    OpDescriptor('micro/ops/nhwc/depthwise_conv_2d_ref.h',
                 'DepthwiseConv2dRefOp',
                 MaceOp.DepthwiseConv2d.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC),
    OpDescriptor('micro/ops/nhwc/depthwise_conv_2d_kb4_s4.h',
                 'DepthwiseConv2dKB4S4Op',
                 MaceOp.DepthwiseConv2d.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC, 'kb4s4'),
    OpDescriptor('micro/ops/nhwc/depthwise_conv_2d_kb3_s4.h',
                 'DepthwiseConv2dKB3S4Op',
                 MaceOp.DepthwiseConv2d.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC, 'kb3s4'),
    OpDescriptor('micro/ops/nhwc/depthwise_conv_2d_kb2_s4.h',
                 'DepthwiseConv2dKB2S4Op',
                 MaceOp.DepthwiseConv2d.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC, 'kb2s4'),
    OpDescriptor('micro/ops/nhwc/depthwise_conv_2d_kb1_s4.h',
                 'DepthwiseConv2dKB1S4Op',
                 MaceOp.DepthwiseConv2d.name, mace_pb2.DT_FLOAT,
                 DataFormat.NHWC, 'kb1s4'),
]
class OpResolver:
    """Resolves each op in a model to the micro implementation
    (OpDescriptor from McSupportedOps) matching its type, data type, data
    format and specialization tag."""

    def __init__(self, pb_model, model_conf):
        self.net_def = pb_model
        self.op_desc_map = {}
        self.op_desc_list = []
        # TensorFlow graphs default to NHWC; other platforms to NCHW.
        if model_conf[ModelKeys.platform] == Platform.TENSORFLOW:
            self.default_data_format = DataFormat.NHWC
        else:
            self.default_data_format = DataFormat.NCHW
        print("OpResolver set default_data_format: %s" %
              self.default_data_format)
        # Quantized models compute in uint8; otherwise use the configured
        # data type (float by default).
        if ModelKeys.quantize in model_conf and \
                model_conf[ModelKeys.quantize] == 1:
            self.default_data_type = mace_pb2.DT_UINT8
        else:
            self.default_data_type = \
                model_conf.get(ModelKeys.data_type, mace_pb2.DT_FLOAT)

    def get_op_data_format(self, op_def):
        """Data format of ``op_def``; the model default when the arg is
        absent or AUTO."""
        arg = self.get_op_def_arg(op_def, MaceKeyword.mace_data_format_str)
        if arg is None or arg.i == DataFormat.AUTO.value:
            return self.default_data_format
        else:
            return DataFormat(arg.i)

    def get_op_data_type(self, op_def):
        """Data type of ``op_def``; the model default when absent."""
        arg = self.get_op_def_arg(op_def, MaceKeyword.mace_op_data_type_str)
        if arg is None:
            return self.default_data_type
        else:
            return arg.i

    def get_op_def_arg(self, op_def, name):
        """Return the argument named ``name`` from ``op_def``, or None."""
        for arg in op_def.arg:
            if arg.name == name:
                return arg
        return None

    def get_op_def_input_dims(self, op_def, idx):
        """Shape of ``op_def``'s idx-th input: looked up among constant
        tensors first, then among outputs of other ops.  None if unknown."""
        input_name = op_def.input[idx]
        for const_tensor in self.net_def.tensors:
            if input_name == const_tensor.name:
                return const_tensor.dims
        for pre_op in self.net_def.op:
            for i in range(len(pre_op.output)):
                if input_name == pre_op.output[i]:
                    return pre_op.output_shape[i].dims
        return None

    def get_op_tag(self, op_def):
        """Choose the specialization tag (e.g. 'c4s4', 'kb4s4', 's4') for
        ops that have optimized kernels; None selects the reference
        implementation."""
        if op_def.type == MaceOp.Conv2D.name:
            output_shape = op_def.output_shape[0].dims
            # Spatial size and channel count are clamped to 4, the largest
            # available specialization.
            size = output_shape[0] * output_shape[1] * output_shape[2]
            if size >= 4:
                size = 4
            channel = output_shape[3]
            if channel >= 4:
                channel = 4
            if channel >= 2 and size >= 4:
                return ("c%ss%s" % (channel, size))
        elif op_def.type == MaceOp.DepthwiseConv2d.name:
            output_shape = op_def.output_shape[0].dims
            size = output_shape[0] * output_shape[1] * output_shape[2]
            if size >= 4:
                size = 4
            filter_dims = self.get_op_def_input_dims(op_def, 1)
            mace_check(filter_dims is not None, "Get filter dims failed.")
            k_batch = filter_dims[0]
            if k_batch >= 4:
                k_batch = 4
            if size >= 4:
                return ("kb%ss%s" % (k_batch, size))
        elif op_def.type == MaceOp.Pooling.name:
            kernels = self.get_op_def_arg(op_def, MaceKeyword.mace_kernel_str)
            mace_check(kernels is not None, "Get kernels failed.")
            size = kernels.ints[0] * kernels.ints[1]
            if size >= 4:
                return "s4"
        return None

    def op_def_desc_type_matched(self, op_def, op_desc):
        """True when ``op_desc`` can implement ``op_def``: data format,
        data type (half/fp16/bf16 ops match float kernels) and tag all
        agree."""
        data_format_match = op_desc.data_format is None or \
            op_desc.data_format == \
            self.get_op_data_format(op_def)
        if not data_format_match:
            return False
        op_data_type = self.get_op_data_type(op_def)
        data_type_match = \
            op_desc.data_type is None or \
            op_desc.data_type == op_data_type or \
            (op_desc.data_type == mace_pb2.DT_FLOAT and
             (op_data_type == mace_pb2.DT_HALF or
              op_data_type == mace_pb2.DT_FLOAT16 or
              op_data_type == mace_pb2.DT_BFLOAT16))
        if not data_type_match:
            return False
        op_tag = self.get_op_tag(op_def)
        if op_tag != op_desc.tag:
            return False
        return True

    def op_def_desc_matched(self, op_def, op_desc):
        """Type match plus an exact op-name match."""
        if not self.op_def_desc_type_matched(op_def, op_desc):
            return False
        return op_def.name == op_desc.name

    def find_op_in_desc_map(self, op_def, op_desc_map):
        """First descriptor in ``op_desc_map`` matching ``op_def``'s type
        signature, or None."""
        if op_def.type not in op_desc_map:
            return None
        op_descs = op_desc_map[op_def.type]
        for op_desc in op_descs:
            if self.op_def_desc_type_matched(op_def, op_desc):
                return op_desc
        print("The op %s's data type can not be found in op_desc_map" %
              op_def.type)
        return None

    def get_op_desc_map_from_model(self):
        """Build (and cache) a map from op type to the list of bound
        descriptors, one per op in the net, in execution order (idx)."""
        if len(self.op_desc_map) > 0:
            return self.op_desc_map
        op_desc_raw_map = {}
        for i in range(len(McSupportedOps)):
            op_desc = McSupportedOps[i]
            if op_desc.type not in op_desc_raw_map:
                op_desc_raw_map[op_desc.type] = []
            op_desc_raw_map[op_desc.type].append(op_desc)
        # NOTE(review): these two attributes are initialized but never
        # filled in this class; possibly legacy -- verify external readers
        # before removing.
        self.op_class_name_list = []
        self.op_src_path_list = []
        self.op_desc_map = {}
        idx = 0
        for op_def in self.net_def.op:
            new_op_desc = None
            op_desc = self.find_op_in_desc_map(op_def, self.op_desc_map)
            if op_desc is None:
                new_op_desc = self.find_op_in_desc_map(op_def, op_desc_raw_map)
                mace_check(new_op_desc is not None,
                           "not support op type %s, data type is %s, format is %s" %  # noqa
                           (op_def.type, self.get_op_data_type(op_def),
                            self.get_op_data_format(op_def)))
                if op_def.type not in self.op_desc_map:
                    self.op_desc_map[op_def.type] = []
            else:
                new_op_desc = copy.deepcopy(op_desc)
            new_op_desc.name = op_def.name
            new_op_desc.idx = idx
            idx += 1
            self.op_desc_map[op_def.type].append(new_op_desc)
        return self.op_desc_map

    def get_op_desc_list_from_model(self):
        """Return (unique source paths sorted by path, class names in
        execution order) for code generation."""
        op_desc_map = self.get_op_desc_map_from_model()
        op_desc_list = []
        for op_descs in op_desc_map.values():
            op_desc_list.extend(op_descs)
        op_desc_list.sort(key=lambda op_desc: op_desc.idx)
        op_class_name_list = [op_desc.class_name for op_desc in op_desc_list]
        op_desc_list.sort(key=lambda op_desc: op_desc.src_path)
        op_src_path_list = [op_desc.src_path for op_desc in op_desc_list]
        # Deduplicate while preserving the sorted order: the previous
        # list(set(...)) discarded the sort and made the generated include
        # list nondeterministic across runs.
        unique_src_paths = list(dict.fromkeys(op_src_path_list))
        return (unique_src_paths, op_class_name_list)
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from google.protobuf.descriptor import FieldDescriptor
from utils.util import mace_check
import sys
import struct
import tempfile
if sys.version > '3':
import queue
else:
import Queue as queue
# Protobuf scalar field types that are serialized inline as fixed-size
# little-endian values (see ProtoConverter.pack).
SimpleTypeArray = [
    FieldDescriptor.TYPE_DOUBLE,
    FieldDescriptor.TYPE_FLOAT,
    FieldDescriptor.TYPE_INT64,
    FieldDescriptor.TYPE_UINT64,
    FieldDescriptor.TYPE_INT32,
    FieldDescriptor.TYPE_BOOL,
    FieldDescriptor.TYPE_UINT32,
    FieldDescriptor.TYPE_ENUM,
]
# This type is string but it should be stored specially
TYPE_STRING_EX = FieldDescriptor.MAX_TYPE + 1000
# Same idea for bytes elements inside repeated fields.
TYPE_BYTES_EX = FieldDescriptor.MAX_TYPE + 1001
# Pseudo-type for 16-bit length/offset words (offset16 mode).
TYPE_UINT16 = FieldDescriptor.MAX_TYPE + 1002
class ObjInfo:
    """Queue entry for breadth-first serialization: the object to write
    plus the (parent_addr, offset) slot where its final address must be
    patched once it has been written."""

    def __init__(self, obj, parent_addr, offset, type=None):
        (self.obj, self.parent_addr,
         self.offset, self.type) = (obj, parent_addr, offset, type)
class ProtoConverter:
    """Serializes a protobuf message tree into MACE micro's flat binary
    layout.

    Objects are written breadth-first; each string/bytes/repeated field is
    stored as a (length, offset) header whose offset word is patched once
    the referenced payload has been written.
    """

    def __init__(self, offset16=False, write_magic=False,
                 exclude_fileds=None):
        # offset16: pack length/offset words as uint16 instead of uint32.
        self.offset16 = offset16
        # write_magic: prefix each message with the first 4 chars of its
        # descriptor name (layout-debugging aid).
        self.write_magic = write_magic
        # NOTE: parameter keeps the historical spelling ("fileds") so
        # keyword callers still work.  A None sentinel replaces the old
        # mutable `{}` default, which is shared across calls in Python.
        self.exclude_fileds = {} if exclude_fileds is None else exclude_fileds

    # return the length of string with '\0'
    def str_raw_len(self, str):
        length = len(str)
        if length > 0:
            length += 1
        return length

    # return the string length padded up to a multiple of 4
    def str_pack_len(self, str):
        return int((self.str_raw_len(str) + 3) / 4) * 4

    def pack(self, value, pb_type):
        """Pack one scalar/string/bytes value into its little-endian form.

        Comparisons use ``==`` instead of ``is``: the TYPE_* constants are
        plain ints and identity tests only worked via CPython interning.
        """
        if pb_type == FieldDescriptor.TYPE_INT32 or \
                pb_type == FieldDescriptor.TYPE_INT64:
            # NOTE(review): 64-bit ints are packed into 4 bytes --
            # presumably intentional for the micro format; verify.
            return struct.pack('<i', value)
        elif pb_type == FieldDescriptor.TYPE_UINT32 or \
                pb_type == FieldDescriptor.TYPE_ENUM or \
                pb_type == FieldDescriptor.TYPE_UINT64:
            return struct.pack('<I', value)
        elif pb_type == FieldDescriptor.TYPE_BOOL:
            return struct.pack('<i', int(value))
        elif pb_type == FieldDescriptor.TYPE_FLOAT:
            return struct.pack('<f', value)
        elif pb_type == FieldDescriptor.TYPE_DOUBLE:
            return struct.pack('<d', value)
        elif pb_type == TYPE_UINT16:
            return struct.pack('<H', value)
        elif pb_type == FieldDescriptor.TYPE_STRING or \
                pb_type == FieldDescriptor.TYPE_BYTES:
            if isinstance(value, str):
                value = bytes(value.encode('utf-8'))
            length = self.str_raw_len(value)
            if length == 0:
                return b''
            # Null-terminate and pad with '\0' up to a multiple of 4.
            pack_length = self.str_pack_len(value)
            empty_len = pack_length - length
            while empty_len > 0:
                value += b'\0'
                empty_len -= 1
            return struct.pack('<' + str(pack_length) + 's', value)
        else:
            mace_check(False,
                       "The pack's pb_type is not supported: %s" % pb_type)

    def get_pack_type(self):
        """Type used for all length/offset words (uint16 in offset16 mode)."""
        pack_type = FieldDescriptor.TYPE_UINT32
        if self.offset16:
            pack_type = TYPE_UINT16
        return pack_type

    def bs_info_to_bytes(self, in_bytes, bs,
                         object_queue, parent_addr, type):
        """Append a (length, offset) header for a string/bytes field and
        queue the payload so its offset word gets patched later."""
        length = self.str_pack_len(bs)
        in_bytes += self.pack(length, self.get_pack_type())
        offset = len(in_bytes)
        in_bytes += self.pack(offset, self.get_pack_type())
        if length > 0:
            object_queue.put(ObjInfo(bs, parent_addr, offset, type))
        return in_bytes

    def string_info_to_bytes(self, in_bytes, string,
                             object_queue, parent_addr):
        return self.bs_info_to_bytes(in_bytes, string, object_queue,
                                     parent_addr, FieldDescriptor.TYPE_STRING)

    def bytes_info_to_bytes(self, in_bytes, bytes, object_queue, parent_addr):
        return self.bs_info_to_bytes(in_bytes, bytes, object_queue,
                                     parent_addr, FieldDescriptor.TYPE_BYTES)

    def array_to_bytes(self, in_bytes, array,
                       object_queue, parent_addr, descriptor):
        """Append a (count, offset) header for a repeated field and queue
        each element; only the first element patches the parent's offset
        word, later elements are written contiguously after it."""
        length = len(array)
        in_bytes += self.pack(length, self.get_pack_type())
        offset = len(in_bytes)
        in_bytes += self.pack(offset, self.get_pack_type())
        if length > 0:
            array_length = len(array)
            for i in range(array_length):
                # other units needn't write offset to their parent
                array_parent_addr = parent_addr
                if i > 0:
                    array_parent_addr = -1
                des_type = descriptor.type
                # Strings/bytes inside arrays get their own (length, offset)
                # header, marked with the *_EX pseudo-types.
                if des_type == FieldDescriptor.TYPE_STRING:
                    des_type = TYPE_STRING_EX
                elif des_type == FieldDescriptor.TYPE_BYTES:
                    des_type = TYPE_BYTES_EX
                object_queue.put(
                    ObjInfo(array[i], array_parent_addr, offset, des_type))
        return in_bytes

    def container_obj_to_bytes(self, obj_info, object_queue, parent_addr):
        """Serialize one protobuf message: each field inline when scalar,
        otherwise as a header whose payload is queued."""
        out_bytes = b''
        if self.write_magic:
            # Encode explicitly: DESCRIPTOR.name is str and struct's '4s'
            # format requires bytes under Python 3.
            out_bytes = struct.pack(
                '<4s', obj_info.obj.DESCRIPTOR.name[0:4].encode('utf-8'))
        for descriptor in obj_info.obj.DESCRIPTOR.fields:
            # Skip fields explicitly excluded for this message type.
            if obj_info.obj.DESCRIPTOR.name in self.exclude_fileds and \
                    descriptor.name in self.exclude_fileds[
                        obj_info.obj.DESCRIPTOR.name]:  # noqa
                continue
            value = getattr(obj_info.obj, descriptor.name)
            if descriptor.label == descriptor.LABEL_REPEATED:
                array = value
                out_bytes = self.array_to_bytes(out_bytes, array, object_queue,
                                                parent_addr, descriptor)
            elif descriptor.type in SimpleTypeArray:
                out_bytes += self.pack(value, descriptor.type)
            elif descriptor.type == descriptor.TYPE_STRING:
                out_bytes = self.string_info_to_bytes(out_bytes, value,
                                                      object_queue,
                                                      parent_addr)
            elif descriptor.type == descriptor.TYPE_BYTES:
                out_bytes = self.bytes_info_to_bytes(out_bytes, value,
                                                     object_queue,
                                                     parent_addr)
            else:
                mace_check(
                    False,
                    "The pb type is not supported: %s" % descriptor.type)
        return out_bytes

    def object_to_bytes(self, obj_info, object_queue, start_addr):
        """Dispatch serialization by the queued unit's kind: message,
        direct string/bytes payload, headered (*_EX) payload, or scalar."""
        if hasattr(obj_info.obj, 'DESCRIPTOR'):
            obj_bytes = self.container_obj_to_bytes(obj_info, object_queue,
                                                    start_addr)
        elif obj_info.type == FieldDescriptor.TYPE_STRING:
            obj_bytes = self.pack(bytes(obj_info.obj.encode('utf-8')),
                                  obj_info.type)
        elif obj_info.type == FieldDescriptor.TYPE_BYTES:
            obj_bytes = self.pack(obj_info.obj, obj_info.type)
        elif obj_info.type == TYPE_STRING_EX:
            obj_bytes = self.string_info_to_bytes(b'', obj_info.obj,
                                                  object_queue, start_addr)
        elif obj_info.type == TYPE_BYTES_EX:
            obj_bytes = self.bytes_info_to_bytes(b'', obj_info.obj,
                                                 object_queue, start_addr)
        else:  # simple obj
            obj_bytes = self.pack(obj_info.obj, obj_info.type)
        return obj_bytes

    def write_obj_queue_to_file(self, object_queue, f):
        """Drain the queue into ``f``, patching each unit's offset word in
        its parent (stored relative to the parent's address)."""
        while not object_queue.empty():
            obj_info = object_queue.get()
            start_addr = f.tell()
            data = self.object_to_bytes(obj_info, object_queue, start_addr)
            f.write(data)
            # write the obj's offset in its parent
            if obj_info.parent_addr >= 0:
                end_addr = f.tell()
                f.seek(obj_info.parent_addr + obj_info.offset, 0)
                f.write(self.pack(start_addr - obj_info.parent_addr,
                                  self.get_pack_type()))
                f.seek(end_addr, 0)

    def proto_to_bytes(self, root_obj):
        """Serialize ``root_obj`` (and everything reachable) to bytes."""
        object_queue = queue.Queue()
        object_queue.put(ObjInfo(root_obj, -1, -1))
        with tempfile.TemporaryFile() as f:
            self.write_obj_queue_to_file(object_queue, f)
            f.seek(0)
            return f.read()
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from py_proto import mace_pb2
from utils.config_parser import ModelKeys
from utils.util import mace_check
from transform.base_converter import MaceKeyword
from transform.base_converter import MaceOp
class ScratchComputer:
    """Computes the scratch-buffer size (in bytes) the micro runtime needs:
    the maximum per-op temporary workspace over all ops in the net."""

    def __init__(self, net_def, model_conf):
        self.net_def = net_def
        # Quantized models compute in uint8; everything else in float.
        if ModelKeys.quantize in model_conf and \
                model_conf[ModelKeys.quantize] == 1:
            self.default_data_type = mace_pb2.DT_UINT8
        else:
            self.default_data_type = mace_pb2.DT_FLOAT
        # Per-op-type scratch calculators; ops without temporary workspace
        # map to scratch_size_no_need.
        # NOTE(review): keys are MaceOp enum members while op_def.type is a
        # string elsewhere in this module; the membership test in
        # compute_size relies on how MaceOp is declared -- verify.
        self._scratch_map = {
            MaceOp.Conv2D: self.scratch_size_no_need,
            MaceOp.Squeeze: self.scratch_size_of_squeeze,
            MaceOp.Softmax: self.scratch_size_no_need,
            MaceOp.Eltwise: self.scratch_size_no_need,
            MaceOp.Activation: self.scratch_size_no_need,
            MaceOp.StridedSlice: self.scratch_size_no_need,
            MaceOp.Reduce: self.scratch_size_no_need,
            MaceOp.Stack: self.scratch_size_no_need,
            MaceOp.BiasAdd: self.scratch_size_no_need,
            MaceOp.BatchNorm: self.scratch_size_no_need,
            MaceOp.Shape: self.scratch_size_no_need,
            MaceOp.Reshape: self.scratch_size_no_need,
            MaceOp.ExpandDims: self.scratch_size_of_expand_dims,
            MaceOp.MatMul: self.scratch_size_of_matmul,
            MaceOp.Pooling: self.scratch_size_of_pooling,
            MaceOp.DepthwiseConv2d: self.scratch_size_of_depthwise_conv,
            MaceOp.ArgMax: self.scratch_size_no_need,
            MaceOp.Cast: self.scratch_size_no_need,
        }

    def compute_size(self):
        """Return the maximum scratch bytes needed by any single op
        (minimum 1 byte)."""
        scratch_size = 1
        for op_def in self.net_def.op:
            mace_check(op_def.type in self._scratch_map,
                       "The %s's scratch func is lost." % op_def.type)
            size = self._scratch_map[op_def.type](op_def)
            if scratch_size < size:
                scratch_size = size
        print("micro scatch buffer size is: %s" % scratch_size)
        return scratch_size

    def scratch_size_no_need(self, op_def):
        """Ops that need no temporary workspace."""
        return 0

    def get_op_def_arg(self, op_def, name):
        """Return the argument named ``name`` from ``op_def``, or None.

        This helper was missing although get_op_data_type calls it, which
        raised AttributeError at runtime; it mirrors OpResolver's version.
        """
        for arg in op_def.arg:
            if arg.name == name:
                return arg
        return None

    def get_op_data_type(self, op_def):
        """Data type of ``op_def``, falling back to the model default."""
        arg = self.get_op_def_arg(op_def, MaceKeyword.mace_op_data_type_str)
        if arg is None:
            return self.default_data_type
        else:
            return arg.i

    def get_data_bytes(self, data_type):
        """Bytes per element for the given mace data type."""
        if data_type == mace_pb2.DT_FLOAT or \
                data_type == mace_pb2.DT_INT32:
            return 4
        elif data_type == mace_pb2.DT_HALF or \
                data_type == mace_pb2.DT_FLOAT16:
            return 2
        elif data_type == mace_pb2.DT_UINT8:
            return 1
        else:
            mace_check(False, "Invalid data type: %s" % data_type)

    def scratch_size_of_expand_dims(self, op_def):
        """One int32 per output dimension."""
        output_dim_size = len(op_def.output_shape[0].dims)
        data_type_bytes = self.get_data_bytes(mace_pb2.DT_INT32)
        return output_dim_size * data_type_bytes

    def scratch_size_of_matmul(self, op_def):
        """One int32 per output dimension."""
        output_dim_size = len(op_def.output_shape[0].dims)
        data_type_bytes = self.get_data_bytes(mace_pb2.DT_INT32)
        return output_dim_size * data_type_bytes

    def get_op_input_dims(self, op_def, idx):
        """Shape of ``op_def``'s idx-th input, from const tensors or other
        ops' outputs; None if it cannot be inferred."""
        input_name = op_def.input[idx]
        for const_tensor in self.net_def.tensors:
            if input_name == const_tensor.name:
                return const_tensor.dims
        for pre_op in self.net_def.op:
            for i in range(len(pre_op.output)):
                if pre_op.output[i] == input_name:
                    return pre_op.output_shape[i].dims
        return None

    def scratch_size_of_pooling(self, op_def):
        """(int32 + float) bytes per channel of input 0 (NHWC: dims[3])."""
        input0_dims = self.get_op_input_dims(op_def, 0)
        channels = input0_dims[3]
        mace_check(channels > 0,
                   "can not inference pooling's input shape.")
        int_bytes = self.get_data_bytes(mace_pb2.DT_INT32)
        float_bytes = self.get_data_bytes(mace_pb2.DT_FLOAT)
        return channels * (int_bytes + float_bytes)

    def scratch_size_of_depthwise_conv(self, op_def):
        """block_size (filter dim 0, clamped to 4) * 4 * filter channels
        (dim 3) floats."""
        filter_dims = self.get_op_input_dims(op_def, 1)
        k_batch = filter_dims[0]
        block_size = k_batch
        if block_size > 4:
            block_size = 4
        k_channels = filter_dims[3]
        float_bytes = self.get_data_bytes(mace_pb2.DT_FLOAT)
        return block_size * 4 * k_channels * float_bytes

    def scratch_size_of_squeeze(self, op_def):
        """One float-sized slot per input dimension."""
        input0_dims = self.get_op_input_dims(op_def, 0)
        return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_FLOAT)
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import numpy as np
from micro.graph_builder import GraphBuilder
from micro.mem_computer import MemComputer
from micro.micro_codegen import MicroCodeGen
from micro.micro_io_converter import MicroIoConverter
from micro.micro_op_converter import MicroOpConverter
from micro.micro_support_ops import OpResolver
from micro.micro_support_ops import McSupportedOps
from micro.proto_to_bytes import ProtoConverter
from micro.scratch_computer import ScratchComputer
from py_proto import mace_pb2
from utils import util
from utils.config_parser import ModelKeys
from utils.convert_util import data_type_to_np_dt
from utils.util import mace_check
# Proto fields stripped per message type when serializing the NetDef for
# the micro runtime (passed to ProtoConverter); these OperatorDef fields
# are not needed by the generated micro engine.
NetDefExcludeFields = {
    'OperatorDef': [
        'quantize_info',
        'node_id',
        'op_id',
        'padding',
        'node_input',
        'out_max_byte_size',
    ],
}
class MicroConverter:
    """Generates the C++ sources for a MACE Micro model.

    Converts a NetDef plus its weight blob into generated headers
    (net def data, graph data, ops list, engine config, tensor data)
    under ``micro/codegen/``, and can package the micro sources into a
    tarball for distribution.
    """

    def __init__(self, model_conf, net_def, model_weights,
                 model_name, offset16=False, write_magic=False):
        """
        :param model_conf: normalized model config dict.
        :param net_def: mace_pb2.NetDef of the converted model.
        :param model_weights: model weight bytes.
        :param model_name: model name used in generated identifiers.
        :param offset16: use 16-bit offsets in serialized protos.
        :param write_magic: write magic markers in serialized protos.
        """
        self.model_conf = model_conf
        data_type = model_conf.get(ModelKeys.data_type, mace_pb2.DT_FLOAT)
        self.net_def = MicroIoConverter.convert(net_def, data_type)
        self.model_weights = model_weights
        self.model_name = model_name
        self.offset16 = offset16
        self.write_magic = write_magic
        self.code_gen = MicroCodeGen()
        # (fixed: data_type was previously recomputed here redundantly)
        self.np_data_type = data_type_to_np_dt(data_type, np.float32)
        self.gen_folder = 'micro/codegen/'
        util.mkdir_p(self.gen_folder)
        self.op_resolver = OpResolver(self.net_def, self.model_conf)

    def gen_code_from_model(self, model_name, pb_model, model_weights):
        """Generate all per-model C++ data headers into codegen/models/."""
        net_def = pb_model
        output_dir = self.gen_folder + 'models/' + model_name + '/'
        shutil.rmtree(output_dir, ignore_errors=True)
        util.mkdir_p(output_dir)
        # Compute mem size and mem block offsets and update the net_def;
        # this must run before ProtoConverter serializes it.
        mem_computer = MemComputer(net_def, self.np_data_type)
        tensor_mem_size = mem_computer.compute()
        # Generate the C++ NetDef struct.
        net_def_converter = ProtoConverter(self.offset16, self.write_magic,
                                           NetDefExcludeFields)
        net_def_bytes = net_def_converter.proto_to_bytes(net_def)
        mace_check(net_def_bytes is not None, "proto_to_bytes failed.")
        self.code_gen.gen_net_def_data(model_name, net_def_bytes,
                                       output_dir + 'micro_net_def_data.h')
        # Generate the operator array.
        (op_src_path_list, op_class_name_list) = \
            self.op_resolver.get_op_desc_list_from_model()
        self.code_gen.gen_ops_data(
            model_name, op_src_path_list, op_class_name_list,
            output_dir + 'micro_ops_list.h')
        # Generate the C++ Graph struct.
        graph = GraphBuilder(net_def, self.op_resolver).build()
        graph_converter = ProtoConverter(self.offset16, self.write_magic)
        graph_bytes = graph_converter.proto_to_bytes(graph)
        self.code_gen.gen_graph_data(model_name, graph_bytes,
                                     output_dir + 'micro_graph_data.h')
        scratch_buffer_size = ScratchComputer(
            net_def, self.model_conf).compute_size()
        # Generate the micro engine config.
        engine_data = {
            'tensor_mem_size': tensor_mem_size,
            'input_size': len(net_def.input_info),
            'scratch_buffer_size': scratch_buffer_size,
        }
        self.code_gen.gen_engin_config(model_name, engine_data,
                                       output_dir + 'micro_engine_config.cc')
        # Generate the micro model tensor data.
        tensor_bytes = bytearray(model_weights)
        self.code_gen.gen_model_data(model_name, tensor_bytes,
                                     output_dir + 'micro_model_data.h')

    def gen_engine_interface_code(self, model_name):
        """Generate the engine factory and C interface sources."""
        output_dir = self.gen_folder + 'engines/' + model_name + '/'
        shutil.rmtree(output_dir, ignore_errors=True)
        util.mkdir_p(output_dir)
        self.code_gen.gen_engine_factory(
            model_name,
            output_dir + 'micro_engine_factory.h',
            output_dir + 'micro_engine_factory.cc')
        self.code_gen.gen_engine_c_interface(
            model_name,
            output_dir + 'micro_engine_c_interface.h',
            output_dir + 'micro_engine_c_interface.cc')

    def gen_code(self):
        """Convert op params, then generate all model and engine code."""
        MicroOpConverter(self.net_def, self.model_weights,
                         self.np_data_type).convert_op_params()
        self.gen_code_from_model(
            self.model_name, self.net_def, self.model_weights)
        self.gen_engine_interface_code(self.model_name)

    def package(self, tar_package_path):
        """Tar up the micro sources, excluding ops this model won't use.

        Op .h/.cc files not referenced by the model, micro/tools and
        micro/test are excluded; a stub WORKSPACE file is added so the
        package builds standalone with bazel.
        """
        (op_h_path_list, op_class_name_list) = \
            self.op_resolver.get_op_desc_list_from_model()
        all_op_header_list = [op_desc.src_path for op_desc in McSupportedOps]
        # Exclude every supported op header that this model does not use,
        # plus its matching .cc source.
        op_h_exclude_list = [op_header for op_header in all_op_header_list
                             if op_header not in op_h_path_list]
        op_cc_exclude_list = \
            [op_h.replace(".h", ".cc") for op_h in op_h_exclude_list]
        exclude_list = ["--exclude=" + op_h for op_h in op_h_exclude_list]
        exclude_list.extend(
            ["--exclude=" + op_h for op_h in op_cc_exclude_list])
        tmp_dir = "/tmp/micro"
        tmp_workspace_file = "WORKSPACE"
        os.system("mkdir -p %s && touch %s/%s" %
                  (tmp_dir, tmp_dir, tmp_workspace_file))
        tar_command = "tar --exclude=micro/tools --exclude=micro/test "
        tar_command += " ".join(exclude_list)
        tar_command += " -zcf " + tar_package_path
        tar_command += " micro -C %s %s" % (tmp_dir, tmp_workspace_file)
        os.system(tar_command)
...@@ -32,6 +32,10 @@ else: ...@@ -32,6 +32,10 @@ else:
device.execute("bazel build //mace/proto:mace_py") device.execute("bazel build //mace/proto:mace_py")
device.execute("cp -f bazel-genfiles/mace/proto/mace_pb2.py %s" % cwd) device.execute("cp -f bazel-genfiles/mace/proto/mace_pb2.py %s" % cwd)
device.execute("bazel build //mace/proto:micro_mem_py")
device.execute(
"cp -f bazel-genfiles/mace/proto/micro_mem_pb2.py %s" % cwd)
device.execute("bazel build //third_party/caffe:caffe_py") device.execute("bazel build //third_party/caffe:caffe_py")
device.execute( device.execute(
"cp -f bazel-genfiles/third_party/caffe/caffe_pb2.py %s" % cwd) "cp -f bazel-genfiles/third_party/caffe/caffe_pb2.py %s" % cwd)
......
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import copy
import numpy as np
import shutil
import tempfile
from micro_converter import MicroConverter
from py_proto import mace_pb2
import run_target
from utils import util
from utils import device
from utils import config_parser
from utils.target import Target
from utils.config_parser import ModelKeys
from utils.util import MaceLogger
from utils.util import mace_check
import validate
import layers_validate
def join_2d_array(xs):
    """Flatten a 2-D array into "a,b:c,d" form (',' in rows, ':' between)."""
    rows = []
    for row in xs:
        rows.append(",".join(str(item) for item in row))
    return ":".join(rows)
def build_engine(model_name, data_type):
    """Build the micro_run_static binary for the given model via bazel.

    :param model_name: name compiled in through -DMICRO_MODEL_NAME;
        must be non-empty.
    :param data_type: mace_pb2 data type; DT_BFLOAT16 additionally
        enables the bfloat16 code path.
    """
    # BUG FIX: previously this checked the module-global `flags.model_name`
    # instead of the `model_name` parameter, so a valid explicit argument
    # could be rejected (or an empty one accepted) based on CLI state.
    mace_check(model_name is not None and len(model_name) > 0,
               "you should specify model name for build.")
    command = "bazel build //micro/tools:micro_run_static" \
              " --config optimization " \
              " --copt \"-DMICRO_MODEL_NAME=%s\"" % model_name
    if data_type == mace_pb2.DT_BFLOAT16:
        command += " --copt \"-DMACE_ENABLE_BFLOAT16\""
        print("The current engine's data type is bfloat16.")
    device.execute(command)
def get_model_conf_by_name(flags, conf):
    """Pick a model config from the yaml conf.

    Returns the config matching ``flags.model_name``; when no name was
    given, the first model wins. Returns None when nothing matches.
    """
    wanted = flags.model_name
    for name, model_conf in conf["models"].items():
        if not wanted or name == wanted:
            return model_conf
    return None
def run_model(flags, args, conf):
    """Look up, normalize and run the model selected by the CLI flags."""
    model_conf = get_model_conf_by_name(flags, conf)
    mace_check(model_conf is not None, "Get model conf failed.")
    normalized_conf = config_parser.normalize_model_config(model_conf)
    run_model_with_conf(flags, args, flags.model_name, normalized_conf)
def gen_sub_model_conf(output_config, flags, conf):
    """Clone the selected model config, retargeting its first subgraph
    at the outputs in ``output_config`` (used for per-layer validation)."""
    sub_conf = copy.deepcopy(get_model_conf_by_name(flags, conf))
    subgraph = sub_conf['subgraphs'][0]
    subgraph['output_tensors'] = output_config['output_tensors']
    subgraph['output_shapes'] = output_config['output_shapes']
    return sub_conf
def run_layers_validate(flags, args, original_conf):
    # Validate the model layer by layer: for each intermediate output
    # selected by flags.layers, build a truncated sub-model ending there,
    # regenerate the micro code, rebuild the engine and run it with
    # validation enabled.
    model_name = flags.model_name
    original_model_dir = flags.output + "/" + \
        original_conf['library_name'] + "/model"
    model_dir = "/tmp/micro_run/model"
    device.execute("mkdir -p %s" % model_dir)
    device.execute("cp -p %s/%s.pb %s" %
                   (original_model_dir, model_name, model_dir))
    params_file_path = "%s/%s.data" % (original_model_dir, model_name)
    # layers_validate writes one truncated .pb per requested layer and
    # returns the matching output configs.
    output_configs = layers_validate.get_layers(
        model_dir, model_name, flags.layers)
    for i in range(len(output_configs)):
        sub_model_conf = gen_sub_model_conf(
            output_configs[i], flags, original_conf)
        with open(output_configs[i]['model_file_path'], "rb") as model_file:
            net_def = mace_pb2.NetDef()
            net_def.ParseFromString(model_file.read())
            with open(params_file_path, "rb") as params_file:
                weights = bytearray(params_file.read())
                micro_conf = \
                    config_parser.normalize_model_config(sub_model_conf)
                # Each sub-model needs its own code generation and a
                # fresh engine build before running.
                MicroConverter(micro_conf, net_def,
                               weights, model_name).gen_code()
                build_engine(model_name, micro_conf[ModelKeys.data_type])
                run_model_with_conf(flags, args, model_name, micro_conf)
def run_model_with_conf(flags, args, model_name, model_conf):
    # Run a converted micro model on the host: prepare input data,
    # invoke the micro_run_static binary, then (optionally) validate
    # its outputs against the original framework.
    target_abi = "host"
    dev = device.HostDevice("host", target_abi)
    install_dir = "/tmp/micro_run/" + model_name

    # When check_tensors are configured, validate against them instead
    # of the model's regular outputs (note: mutates model_conf).
    if ModelKeys.check_tensors in model_conf:
        model_conf[ModelKeys.output_tensors] = model_conf[
            ModelKeys.check_tensors]
        model_conf[ModelKeys.output_shapes] = model_conf[
            ModelKeys.check_shapes]

    # Command-line arguments forwarded to micro_run_static.
    model_args = {"model_name": model_name,
                  "input_node": ",".join(
                      model_conf[ModelKeys.input_tensors]),
                  "input_shape": join_2d_array(
                      model_conf[ModelKeys.input_shapes]),
                  "output_node": ",".join(
                      model_conf[ModelKeys.output_tensors]),
                  "output_shape": join_2d_array(
                      model_conf[ModelKeys.output_shapes]),
                  "input_data_format": ",".join(
                      [df.name for df in
                       model_conf[ModelKeys.input_data_formats]]),
                  "output_data_format": ",".join(
                      [df.name for df in
                       model_conf[ModelKeys.output_data_formats]])
                  }

    opts = ["--%s=%s" % (arg_key, arg_val) for arg_key, arg_val in
            model_args.items()] + args

    # generate data start
    # Inputs either come from configured validation data files or are
    # generated randomly per the configured shapes/ranges/types.
    tmp_dir_name = tempfile.mkdtemp()
    input_file_prefix = tmp_dir_name + "/" + model_name
    if ModelKeys.validation_inputs_data in model_conf:
        input_tensor = model_conf[ModelKeys.input_tensors]
        input_data = model_conf[ModelKeys.validation_inputs_data]
        mace_check(len(input_tensor) == len(input_data),
                   "len(input_tensor) != len(validate_data")
        for i in range(len(input_tensor)):
            util.download_or_get_file(
                model_conf[ModelKeys.validation_inputs_data][i], "",
                util.formatted_file_name(input_file_prefix,
                                         input_tensor[i]))
    else:
        generate_input_data(input_file_prefix,
                            model_conf[ModelKeys.input_tensors],
                            model_conf[ModelKeys.input_shapes],
                            model_conf[ModelKeys.input_ranges],
                            model_conf[ModelKeys.input_data_types])
    dev.install(Target(tmp_dir_name), install_dir + "/validate_in")
    target_input_file = "%s/validate_in/%s" % (
        install_dir, model_name)
    target_output_dir = "%s/validate_out" % install_dir
    dev.mkdir(target_output_dir)
    target_output_file = target_output_dir + "/" + model_name
    opts += ["--input_file=%s" % target_input_file,
             "--output_file=%s" % target_output_file]
    # generate data end

    envs = []
    if flags.vlog_level > 0:
        envs += ["MACE_CPP_MIN_VLOG_LEVEL=%s" % flags.vlog_level]

    # Execute the previously built micro_run_static binary on the host.
    target = Target("bazel-bin/micro/tools/micro_run_static", [],
                    opts=opts, envs=envs)
    run_target.run_target(target_abi, install_dir, target,
                          device_ids="host")

    if flags.validate:
        # Fetch the original framework model (and weights, if any) and
        # compare its outputs against the micro engine's outputs.
        validate_model_file = util.download_or_get_model(
            model_conf[ModelKeys.model_file_path],
            model_conf[ModelKeys.model_sha256_checksum],
            tmp_dir_name)
        validate_weight_file = ""
        if ModelKeys.weight_file_path in model_conf:
            validate_weight_file = util.download_or_get_model(
                model_conf[ModelKeys.weight_file_path],
                model_conf[ModelKeys.weight_sha256_checksum],
                tmp_dir_name)
        dev.pull(Target(target_output_dir), tmp_dir_name + "/validate_out")
        output_file_prefix = tmp_dir_name + "/validate_out/" + model_name
        validate.validate(model_conf[ModelKeys.platform],
                          validate_model_file,
                          validate_weight_file,
                          input_file_prefix,
                          output_file_prefix,
                          model_conf[ModelKeys.input_shapes],
                          model_conf[ModelKeys.output_shapes],
                          model_conf[ModelKeys.input_data_formats],
                          model_conf[ModelKeys.output_data_formats],
                          model_conf[ModelKeys.input_tensors],
                          model_conf[ModelKeys.output_tensors],
                          flags.validate_threshold,
                          model_conf[ModelKeys.input_data_types],
                          flags.backend,
                          "",
                          "")
    shutil.rmtree(tmp_dir_name)
def generate_input_data(input_file, input_node, input_shape, input_ranges,
                        input_data_type):
    """Write one file of random input data per input node.

    :param input_file: path prefix; per-node names come from
        util.formatted_file_name.
    :param input_node: list of input tensor names.
    :param input_shape: list of shapes, parallel to input_node.
    :param input_ranges: list of (low, high) value ranges.
    :param input_data_type: list of mace_pb2 data types.
    :raises Exception: for data types other than DT_FLOAT / DT_INT32.
    """
    np.random.seed()
    for i in range(len(input_node)):
        low, high = input_ranges[i][0], input_ranges[i][1]
        data = np.random.random(input_shape[i]) * (high - low) + low
        input_file_name = util.formatted_file_name(input_file, input_node[i])
        MaceLogger.info('Generate input file: %s' % input_file_name)
        if input_data_type[i] == mace_pb2.DT_FLOAT:
            np_data_type = np.float32
        elif input_data_type[i] == mace_pb2.DT_INT32:
            np_data_type = np.int32
        else:
            # BUG FIX: previously np_data_type was left undefined here,
            # causing a NameError (or silently reusing the previous
            # iteration's dtype) for unsupported types.
            raise Exception('Input data type %s not supported' %
                            input_data_type[i])
        data.astype(np_data_type).tofile(input_file_name)
def parse_args():
    """Build the CLI for the micro run tool.

    :return: (known_flags, extra_args) from parse_known_args(); the
        extra args are forwarded verbatim to micro_run_static.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="",
                        help="yaml conf path")
    parser.add_argument("--model_name", type=str, default="",
                        help="model name in yaml conf")
    parser.add_argument("--validate", action="store_true",
                        help="enable validate")
    parser.add_argument("--validate_threshold", type=float, default="0.99",
                        help="validate threshold")
    parser.add_argument("--layers", type=str, default="-1",
                        help="'start_layer:end_layer' or 'layer', similar to python slice."
                             " Use with --validate flag.")
    parser.add_argument("--backend", type=str, default="tensorflow",
                        help="onnx backend framework")
    parser.add_argument("--build", action="store_true",
                        help="if build before run")
    parser.add_argument('--output', type=str, default="build",
                        help="output dir")
    parser.add_argument('--vlog_level', type=int, default="0",
                        help="vlog level")
    return parser.parse_known_args()
if __name__ == "__main__":
    flags, args = parse_args()
    conf = config_parser.parse(flags.config)
    # Build the engine up front when requested explicitly or when
    # validation needs a runnable binary.
    if flags.build or flags.validate:
        micro_conf = config_parser.normalize_model_config(
            conf[ModelKeys.models][flags.model_name])
        build_engine(flags.model_name, micro_conf[ModelKeys.data_type])
    # "-1" means all layers disabled: run end-to-end; otherwise validate
    # the selected layer range one sub-model at a time.
    if flags.validate and flags.layers != "-1":
        run_layers_validate(flags, args, conf)
    else:
        run_model(flags, args, conf)
...@@ -20,7 +20,7 @@ namespace mace { ...@@ -20,7 +20,7 @@ namespace mace {
const unsigned char *{{ load_func_name }}() { const unsigned char *{{ load_func_name }}() {
{% if data_size == 0 %} {% if data_size == 0 %}
return nullptr; return NULL;
{% else %} {% else %}
static const unsigned char kData[{{ data_size }}] = { static const unsigned char kData[{{ data_size }}] = {
{% for d in data %}{{"0x%02X, " % d }}{%endfor%} {% for d in data %}{{"0x%02X, " % d }}{%endfor%}
......
...@@ -73,6 +73,7 @@ def parse_device_info(path): ...@@ -73,6 +73,7 @@ def parse_device_info(path):
class ModelKeys(object): class ModelKeys(object):
platform = "platform" platform = "platform"
runtime = "runtime" runtime = "runtime"
models = 'models'
graph_optimize_options = "graph_optimize_options" graph_optimize_options = "graph_optimize_options"
input_tensors = "input_tensors" input_tensors = "input_tensors"
input_shapes = "input_shapes" input_shapes = "input_shapes"
...@@ -175,6 +176,8 @@ def parse_data_type(str): ...@@ -175,6 +176,8 @@ def parse_data_type(str):
def parse_internal_data_type(str): def parse_internal_data_type(str):
if str == 'fp32_fp32': if str == 'fp32_fp32':
return mace_pb2.DT_FLOAT return mace_pb2.DT_FLOAT
elif str == 'bf16_fp32':
return mace_pb2.DT_BFLOAT16
else: else:
return mace_pb2.DT_HALF return mace_pb2.DT_HALF
...@@ -187,6 +190,8 @@ def to_list(x): ...@@ -187,6 +190,8 @@ def to_list(x):
def parse_int_array(xs): def parse_int_array(xs):
if len(xs) is 0:
return [1]
return [int(x) for x in xs.split(",")] return [int(x) for x in xs.split(",")]
...@@ -201,7 +206,6 @@ def normalize_model_config(conf): ...@@ -201,7 +206,6 @@ def normalize_model_config(conf):
del conf[ModelKeys.subgraphs] del conf[ModelKeys.subgraphs]
conf.update(subgraph) conf.update(subgraph)
print(conf)
conf[ModelKeys.platform] = parse_platform(conf[ModelKeys.platform]) conf[ModelKeys.platform] = parse_platform(conf[ModelKeys.platform])
conf[ModelKeys.runtime] = parse_device_type(conf[ModelKeys.runtime]) conf[ModelKeys.runtime] = parse_device_type(conf[ModelKeys.runtime])
......
# Copyright 2020 The MACE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# python tools/python/convert.py \
# --config ../mace-models/mobilenet-v2/mobilenet-v2.yml
import array
import numpy as np
import struct
from py_proto import mace_pb2
def Float2BFloat16Bytes(float_data):
    """Pack floats as bfloat16 bytes.

    bfloat16 is the top 16 bits of an IEEE float32, so each value is
    encoded as float32 and truncated (no rounding). Returns the
    native-endian uint16 byte string.
    """
    as_float32 = np.asarray(list(float_data), dtype=np.float32)
    top_halves = as_float32.view(np.uint32) >> 16
    return top_halves.astype(np.uint16).tobytes()
def merge_params(net_def, data_type):
    # Serialize all constant tensors of net_def into one byte blob.
    # Float tensors are first converted to `data_type`; each tensor's
    # byte offset (4-byte aligned, except uint8 data which packs
    # unaligned) and data_size are recorded on the proto, then the
    # in-proto payloads are cleared. Returns (net_def, model_data)
    # where model_data is a flat list of bytes. Mutates net_def.
    def tensor_to_bytes(tensor):
        # Encode one tensor's payload per its data_type and set
        # tensor.data_size to the element count.
        if tensor.data_type == mace_pb2.DT_HALF:
            data = bytearray(
                np.array(tensor.float_data).astype(np.float16).tobytes())
            tensor.data_size = len(tensor.float_data)
        elif tensor.data_type == mace_pb2.DT_FLOAT:
            data = bytearray(
                np.array(tensor.float_data).astype(np.float32).tobytes())
            tensor.data_size = len(tensor.float_data)
        elif tensor.data_type == mace_pb2.DT_INT32:
            data = bytearray(
                np.array(tensor.int32_data).astype(np.int32).tobytes())
            tensor.data_size = len(tensor.int32_data)
        elif tensor.data_type == mace_pb2.DT_UINT8:
            data = bytearray(
                np.array(tensor.int32_data).astype(np.uint8).tolist())
            tensor.data_size = len(tensor.int32_data)
        elif tensor.data_type == mace_pb2.DT_FLOAT16:
            data = bytearray(
                np.array(tensor.float_data).astype(np.float16).tobytes())
            tensor.data_size = len(tensor.float_data)
        elif tensor.data_type == mace_pb2.DT_BFLOAT16:
            # bfloat16: truncate each float32 to its top 16 bits.
            data = Float2BFloat16Bytes(tensor.float_data)
            tensor.data_size = len(tensor.float_data)
        else:
            raise Exception('Tensor data type %s not supported' %
                            tensor.data_type)
        return data

    model_data = []
    offset = 0
    for tensor in net_def.tensors:
        # Retarget float tensors to the requested storage data type.
        if tensor.data_type == mace_pb2.DT_FLOAT:
            tensor.data_type = data_type
        raw_data = tensor_to_bytes(tensor)
        # Keep multi-byte data 4-byte aligned; uint8 packs unaligned.
        if tensor.data_type != mace_pb2.DT_UINT8 and offset % 4 != 0:
            padding = 4 - offset % 4
            model_data.extend(bytearray([0] * padding))
            offset += padding
        tensor.offset = offset
        model_data.extend(raw_data)
        offset += len(raw_data)

    # Payloads now live in model_data; drop them from the protos.
    for tensor in net_def.tensors:
        if tensor.data_type == mace_pb2.DT_FLOAT \
                or tensor.data_type == mace_pb2.DT_HALF \
                or tensor.data_type == mace_pb2.DT_FLOAT16\
                or tensor.data_type == mace_pb2.DT_BFLOAT16:
            del tensor.float_data[:]
        elif tensor.data_type == mace_pb2.DT_INT32:
            del tensor.int32_data[:]
        elif tensor.data_type == mace_pb2.DT_UINT8:
            del tensor.int32_data[:]

    return net_def, model_data
def data_type_to_np_dt(data_type, default_np_dt):
    """Map a mace_pb2 data type to a numpy dtype.

    :param data_type: a mace_pb2 DataType value, or None.
    :param default_np_dt: dtype returned when data_type is None.
    :return: the numpy dtype used to store this data type.
    """
    if data_type is None:
        return default_np_dt
    elif data_type == mace_pb2.DT_HALF or data_type == mace_pb2.DT_FLOAT16:
        return np.float16
    elif data_type == mace_pb2.DT_INT32:
        # BUG FIX: previously returned np.int, an alias for the builtin
        # int that was deprecated in NumPy 1.20 and removed in 1.24;
        # DT_INT32 maps to an explicit 32-bit integer dtype.
        return np.int32
    elif data_type == mace_pb2.DT_UINT8:
        return np.uint8
    elif data_type == mace_pb2.DT_BFLOAT16:
        # numpy has no bfloat16; values are stored as raw uint16.
        return np.uint16
    else:
        return np.float32
...@@ -55,6 +55,8 @@ def execute(cmd, verbose=True): ...@@ -55,6 +55,8 @@ def execute(cmd, verbose=True):
buf.append(line) buf.append(line)
if p.returncode != 0: if p.returncode != 0:
if verbose:
print(line)
raise Exception("errorcode: %s" % p.returncode) raise Exception("errorcode: %s" % p.returncode)
return "\n".join(buf) return "\n".join(buf)
...@@ -95,11 +97,11 @@ class HostDevice(Device): ...@@ -95,11 +97,11 @@ class HostDevice(Device):
if install_dir.strip() and install_dir != os.path.dirname(target.path): if install_dir.strip() and install_dir != os.path.dirname(target.path):
execute("mkdir -p %s" % install_dir) execute("mkdir -p %s" % install_dir)
if os.path.isdir(target.path): if os.path.isdir(target.path):
execute("cp %s/* %s" % (target.path, install_dir)) execute("cp -f %s/* %s" % (target.path, install_dir))
else: else:
execute("cp %s %s" % (target.path, install_dir)) execute("cp -f %s %s" % (target.path, install_dir))
for lib in target.libs: for lib in target.libs:
execute("cp %s %s" % (lib, install_dir)) execute("cp -f %s %s" % (lib, install_dir))
target.path = "%s/%s" % (install_dir, target.path = "%s/%s" % (install_dir,
os.path.basename(target.path)) os.path.basename(target.path))
...@@ -117,7 +119,7 @@ class HostDevice(Device): ...@@ -117,7 +119,7 @@ class HostDevice(Device):
out_dir = os.path.abspath(out_dir) out_dir = os.path.abspath(out_dir)
if out_dir.strip() and out_dir != os.path.dirname(target.path): if out_dir.strip() and out_dir != os.path.dirname(target.path):
execute("cp -r %s %s" % (target.path, out_dir)) execute("cp -rp %s %s" % (target.path, out_dir))
def mkdir(self, dirname): def mkdir(self, dirname):
execute("mkdir -p %s" % dirname) execute("mkdir -p %s" % dirname)
......
...@@ -96,7 +96,7 @@ def compare_output(output_name, mace_out_value, ...@@ -96,7 +96,7 @@ def compare_output(output_name, mace_out_value,
util.StringFormatter.block("Similarity Test Passed")) util.StringFormatter.block("Similarity Test Passed"))
else: else:
util.MaceLogger.error( util.MaceLogger.error(
"", util.StringFormatter.block("Similarity Test Failed")) util.StringFormatter.block("Similarity Test Failed"))
else: else:
util.MaceLogger.error( util.MaceLogger.error(
"", util.StringFormatter.block( "", util.StringFormatter.block(
...@@ -110,6 +110,16 @@ def normalize_tf_tensor_name(name): ...@@ -110,6 +110,16 @@ def normalize_tf_tensor_name(name):
return name return name
def get_data_type_by_value(value):
data_type = value.dtype
if data_type == np.float32:
return mace_pb2.DT_FLOAT
elif data_type == np.int32:
return mace_pb2.DT_INT32
else:
return mace_pb2.DT_FLOAT
def validate_with_file(output_names, output_shapes, def validate_with_file(output_names, output_shapes,
mace_out_file, validation_outputs_data, mace_out_file, validation_outputs_data,
validation_threshold, log_file): validation_threshold, log_file):
...@@ -182,7 +192,9 @@ def validate_tf_model(model_file, ...@@ -182,7 +192,9 @@ def validate_tf_model(model_file,
for i in range(len(output_names)): for i in range(len(output_names)):
output_file_name = util.formatted_file_name( output_file_name = util.formatted_file_name(
mace_out_file, output_names[i]) mace_out_file, output_names[i])
mace_out_value = load_data(output_file_name) mace_out_value = load_data(
output_file_name,
get_data_type_by_value(output_values[i]))
if output_data_formats[i] == DataFormat.NCHW and \ if output_data_formats[i] == DataFormat.NCHW and \
len(output_shapes[i]) == 4: len(output_shapes[i]) == 4:
mace_out_value = mace_out_value. \ mace_out_value = mace_out_value. \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册