提交 6352c291 编写于 作者: Z Zhang Zhimin

feat: Integrate CMSIS5 to MACE Micro

上级 e1f4fd86
......@@ -22,12 +22,14 @@ mace/codegen/version/
mace/codegen/engine/
mace/codegen/lib/
micro/codegen/models/
micro/codegen/engines/
examples/android/macelibrary/src/main/cpp/mace/
examples/android/macelibrary/src/main/cpp/include/
examples/android/macelibrary/src/main/cpp/lib/arm64-v8a/
examples/android/macelibrary/src/main/jniLibs/arm64-v8a/
tools/python/py_proto/*_pb2.py
micro/codegen/models/
micro/codegen/engines/
micro/examples/micro
micro/build
\ No newline at end of file
......@@ -19,7 +19,7 @@ cpplint:
pylint:
stage: linting
script:
- pycodestyle $(find -name "*.py")
- pycodestyle . --filename=*.py --exclude=examples,third_party
build_docs:
stage: build
......@@ -107,14 +107,12 @@ mace_cc_test:
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- python tools/bazel_adb_run.py --target="//test/ccunit:mace_cc_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//micro/test/ccunit:micro_ops_test" --run_target=True --stdout_processor=ops_benchmark_stdout_processor --target_abis=arm64-v8a
mace_cc_benchmark:
stage: test
script:
- if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
- python tools/bazel_adb_run.py --target="//test/ccbenchmark:mace_cc_benchmark" --run_target=True --stdout_processor=ops_benchmark_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS --args="--filter=.*SIGMOID.*"
- python tools/bazel_adb_run.py --target="//micro/test/ccbenchmark:micro_cc_benchmark" --run_target=True --stdout_processor=ops_benchmark_stdout_processor --target_abis=arm64-v8a
only:
- triggers
......@@ -141,14 +139,6 @@ model_tests:
- python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=code --model_data_format=file
- python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=code --model_data_format=file
- python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=5 --model_graph_format=code --model_data_format=file --benchmark
- CONF_FILE=mace-models/micro-models/har-cnn/har-cnn.yml
- python tools/converter.py convert --config=${CONF_FILE} --enable_micro
- python tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name har_cnn
- python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build --benchmark
- CONF_FILE=mace-models/micro-models/har-cnn/har-cnn-bf16.yml
- python tools/converter.py convert --config=${CONF_FILE} --enable_micro
- python tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name har_cnn
- rm -rf mace-models
quantization_tests:
stage: test
......@@ -202,3 +192,4 @@ micro-child:
trigger:
include:
- 'micro/.gitlab-ci.yml'
strategy: depend
# Third-party dependencies vendored as shallow git submodules
# (init with: git submodule update --init micro).
# CMSIS_5 provides the Arm CMSIS-NN/DSP kernels used when
# MACE_MICRO_ENABLE_CMSIS is enabled.
[submodule "micro/third_party/CMSIS_5"]
path = micro/third_party/CMSIS_5
url = https://github.com/ARM-software/CMSIS_5.git
shallow = true
# googletest is only needed when MACE_MICRO_ENABLE_TESTS is on.
[submodule "micro/third_party/googletest"]
path = micro/third_party/googletest
url = https://github.com/google/googletest.git
shallow = true
# gflags is used by the host-side tools build.
[submodule "micro/third_party/gflags"]
path = micro/third_party/gflags
url = https://github.com/gflags/gflags.git
shallow = true
FROM ubuntu:18.04

# MACE Micro development image: host and arm-none-eabi toolchains,
# CMake >= 3.13, and the Python packages used by the converter tools.

# Keep "apt-get update" and "apt-get install" in the SAME layer: a cached
# standalone "update" layer would otherwise let later installs resolve against
# stale package indexes. Removing the apt lists shrinks the image.
RUN apt-get update \
    && apt-get install -y \
        wget \
        g++ gcc \
        gcc-arm-none-eabi \
        python3 python3-pip git mercurial \
    && rm -rf /var/lib/apt/lists/*

# The project needs CMake above 3.13.0, newer than the Ubuntu 18.04 package;
# install a prebuilt 3.18.3 and drop the installer afterwards.
RUN wget https://cdn.cnbj1.fds.api.mi-img.com/mace/third-party/cmake-3.18.3-Linux-x86_64.sh \
    && chmod +x cmake-3.18.3-Linux-x86_64.sh \
    && ./cmake-3.18.3-Linux-x86_64.sh --skip-license --prefix=/usr \
    && rm cmake-3.18.3-Linux-x86_64.sh

# Python tooling. The TensorFlow version depends on the model being converted;
# mbed-cli is used to build the Cortex-M example firmware.
RUN python3 -m pip install -U pip \
    && python3 -m pip install jinja2 pyyaml sh numpy six filelock \
    && python3 -m pip install tensorflow==2.3.0 tensorflow_model_optimization \
    && python3 -m pip install mbed-cli
Basic usage for Micro Controllers
==================================
MACE Micro is a lightweight neural network inference engine for MCUs and low-power DSPs.
Currently, we support Cortex-M MCUs and Qualcomm Hexagon DSPs. You can get our projects from GitHub.
Build and run an example model
-------------------------------
Get MACE Micro Projects
-----------------------
At first, make sure the environment has been set up correctly already (refer to :doc:`../installation/env_requirement`).
MACE Micro is a sub project of MACE, so you can get it from MACE.
The followings are instructions about how to quickly build and run a provided model in
`MACE Model Zoo <https://github.com/XiaoMi/mace-models>`__.
.. code-block:: sh
Here we use the har-cnn model as an example.
git clone https://github.com/XiaoMi/mace.git
# Inits submodules by yourself
cd mace && git submodule update --init micro && cd ..
**Commands**
Environment Requirements
------------------------
1. Pull `MACE <https://github.com/XiaoMi/mace>`__ project.
On an Ubuntu 18.04/20.04 PC, do the following steps.
.. code-block:: sh
.. code-block:: sh
git clone https://github.com/XiaoMi/mace.git
cd mace/
git fetch --all --tags --prune
apt-get update
apt-get install -y wget
# Checkout the latest tag (i.e. release version)
tag_name=`git describe --abbrev=0 --tags`
git checkout tags/${tag_name}
apt-get install -y g++
# Required for Cortex-M MCUs
apt-get install -y gcc-arm-none-eabi
apt-get install -y python3 python3-pip
.. note::
python3 -m pip install jinja2 pyyaml sh numpy six filelock
# Installs cmake above 3.13.0
wget https://cdn.cnbj1.fds.api.mi-img.com/mace/third-party/cmake-3.18.3-Linux-x86_64.sh
chmod +x cmake-3.18.3-Linux-x86_64.sh && ./cmake-3.18.3-Linux-x86_64.sh --skip-license --prefix=/usr
It's highly recommended to use a release version instead of master branch.
python3 -m pip install -U pip
# The Tensorflow version depends on your model
# The TensorFlow 1.x frozen model and TensorFlow 2.x Keras model are both supported
python3 -m pip install tensorflow==2.3.0
python3 -m pip install tensorflow_model_optimization
You can also use Docker as the environment.
2. Pull `MACE Model Zoo <https://github.com/XiaoMi/mace-models>`__ project.
.. code-block:: sh
.. code-block:: sh
cd mace/docker/mace-micro-dev
docker build . -f mace-micro-dev.dockerfile --tag mace-micro-dev
cd ../../..
# Maps your workspace to docker container
docker run -ti -v $(pwd):/workspace/ -w /workspace mace-micro-dev
git clone https://github.com/XiaoMi/mace-models.git
Convert a model to c++ code
----------------------------
3. Convert the pre-trained har-cnn model to c++ code.
Here we use a pre-trained model of the MNIST database,
.. code-block:: sh
.. code-block:: sh
cd path/to/mace
# output lib path: build/har-cnn/model/har_cnn_micro.tar.gz
CONF_FILE=/path/to/mace-models/micro-models/har-cnn/har-cnn.yml
python tools/python/convert.py --config=$CONF_FILE --enable_micro
cd mace
# Converts a TensorFlow 2.x Keras model; you need to install python3 and tensorflow==2.x in addition
python3 tools/python/convert.py --config=micro/pretrained_models/keras/mnist/mnist.yml --enable_micro
4. Build Micro-Controllers engine and models to library on host.
Model config file
-----------------
.. code-block:: sh
The following is a completed model config file,
cd micro
./tools/cmake/cmake-build-host.sh
.. code-block:: sh
.. note::
library_name: har
target_abis: [host]
model_graph_format: file
model_data_format: file
models:
har_int8:
platform: keras
model_file_path: https://cdn.cnbj1.fds.api.mi-img.com/mace/miai-models/micro/keras/har/har.h5
model_sha256_checksum: ec0477b8e489541bb34377c9cabc42ee6cefa8bdf0a9f726e06be1b967ea1dcd
subgraphs:
- input_tensors:
- "conv2d_1_input:0"
input_shapes:
- 1, 90, 3, 1
input_ranges:
- -5, 15
output_tensors:
- "dense_3/Softmax:0"
output_shapes:
- "1, 6"
runtime: cpu
data_type: fp32_fp32
limit_opencl_kernel_time: 0
nnlib_graph_mode: 0
obfuscate: 0
winograd: 0
quantize: 1
quantize_schema: int8
quantize_range_file: /workspace/mace/micro/pretrained_models/keras/har/har.range
- The build result ``build/cmake-build/host/libmicro.a``'s abi is host, if you want to run the model on micro controllers, you should build the code with the right toolchain, for example
For the bfloat16 model,
.. code-block:: sh
cd micro
export HEXAGON_SDK_ROOT=/home/user/Qualcomm/Hexagon_SDK/3.4.1
export HEXAGON_TOOLS=/home/user/Qualcomm/HEXAGON_Tools/6.4.06
./tools/cmake/cmake-build-hexagon6.sh
.. code-block:: yaml
5. Run the model on host.
data_type: bf16_fp32
.. code-block:: sh
For the int8 model,
CONF_FILE=/path/to/mace-models/micro-models/har-cnn/har-cnn.yml
# Run
python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build
.. code-block:: yaml
# Test model run time
python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build --round=100
quantize: 1
quantize_schema: int8
# Required when your model has not quantize info
quantize_range_file: range_file_path
# Validate the correctness by comparing the results against the
# original model and framework, measured with cosine distance for similarity.
python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build --validate
# Validate the layers' correctness.
python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build --validate --layers 0:-1
Build MACE Micro and models libraries
--------------------------------------
Deploy your model into applications
Here, we build the MACE Micro engine and models to libraries on a Linux host machine. The CMake build parameters depend on your model config file.
For float32 model,
.. code-block:: sh
./micro/tools/cmake/cmake-build-host.sh
For bfloat16 model,
.. code-block:: sh
./micro/tools/cmake/cmake-build-host.sh -DMACE_MICRO_ENABLE_BFLOAT16=ON
.. note::
You can only use either float32 or bfloat16
For int8 model,
.. code-block:: sh
./micro/tools/cmake/cmake-build-host.sh -DMACE_MICRO_ENABLE_CMSIS=ON
Use libraries directly
-----------------------
With these steps, we can find necessary libraries and headers in the "build/micro/host/install" directory, you can use the libraries directly.
.. code-block:: sh
# Builds example
g++ micro/examples/classifier/main.cc -DMICRO_MODEL_NAME=mnist -DMICRO_DATA_NAME=mnist -I build/micro/host/install/include/ -L build/micro/host/install/lib/ -lmicro -lmodels -lmicro -o mnist
# Runs the mnist example
./mnist
Code example
------------------------------------
Please refer to \ ``/mace/micro/tools/micro_run.cc`` for full usage. The following list the key steps.
The following code is from the mnist example source files, in which the main steps are annotated
.. code-block:: cpp
// Include the headers
#include "micro/include/public/micro.h"
// 1. Create MaceMicroEngine instance
MaceMicroEngine *micro_engine = nullptr;
MaceStatus status = har_cnn::GetMicroEngineSingleton(&micro_engine);
// 1. Create and register Input buffers
std::vector<std::shared_ptr<char>> inputs;
std::vector<int32_t> input_sizes;
for (size_t i = 0; i < input_shapes.size(); ++i) {
input_sizes.push_back(std::accumulate(input_shapes[i].begin(),
input_shapes[i].end(), sizeof(float),
std::multiplies<int32_t>()));
inputs.push_back(std::shared_ptr<char>(new char[input_sizes[i]],
std::default_delete<char[]>()));
}
// TODO: fill data into input buffers
for (size_t i = 0; i < input_names.size(); ++i) {
micro_engine->RegisterInputData(i, inputs[i].get(),
input_shapes[i].data());
#include "data/mnist.h"
#include <cstdio>
// Include MACE Micro header
#include "micro.h"
namespace micro {
namespace mnist {
// We use a forward declaration to avoid including the model-specific engine header
MaceStatus GetMicroEngineSingleton(MaceMicroEngine **engine);
}
} // namespace micro
// 3. Run the model
MaceStatus status = micro_engine->Run();
int main() {
// Step 1, get the mnist micro engine
micro::MaceMicroEngine *micro_engine = NULL;
micro::MaceStatus status =
micro::mnist::GetMicroEngineSingleton(&micro_engine);
// 4. Get the results
for (size_t i = 0; i < output_names.size(); ++i) {
void *output_buffer = nullptr;
const int32_t *output_dims = nullptr;
// Step 2, set input data
static float *input_data = data_mnist_4;
int32_t input_dims[4] = {1, 28, 28, 1};
micro_engine->RegisterInputData(0, input_data, input_dims);
// Step3, run the inference
micro_engine->Run();
// Step 4, get output data
float *output_buffer = NULL;
const int32_t *output_dims = NULL;
uint32_t dim_size = 0;
MaceStatus status =
micro_engine->GetOutputData(i, &output_buffer, &output_dims, &dim_size);
// Note: the result data lives in output_buffer, which is owned by the engine; do not delete it.
micro_engine->GetOutputData(
0, reinterpret_cast<void **>(&output_buffer), &output_dims, &dim_size);
for (int32_t i = 0; i < output_dims[1]; ++i) {
printf("%d: %f\n", i, output_buffer[i]);
}
return 0;
}
For more examples, go to the directory "micro/examples"
Performance
-----------
We deploy a `HAR-CNN <https://github.com/Shahnawax/HAR-CNN-Keras>`__ int8 model on the NUCLEO-F767ZI(Cortex-M7) board. Each inference of HAR CNN model takes 12 ms.
\ No newline at end of file
Deploy
======
The MACE Micro module is written in C++98 and only depends on <cmath>.
We can write a CMake toolchain file to build the program for the special platform.
For Cortex-M MCU
----------------
Now we deploy the MNIST classifier example on a NUCLEO-F767ZI development board with Mbed OS.
Install a GCC Arm Embedded compiler by the terminal.
.. code-block:: sh
# Installs gcc arm
sudo apt-get install gcc-arm-none-eabi
Refer to <https://os.mbed.com/docs/mbed-os/v6.3/build-tools/install-and-set-up.html/> to install Mbed OS tools.
Now we can convert the model and build the program,
.. code-block:: sh
python3 tools/python/convert.py --config=micro/pretrained_models/keras/mnist/mnist-int8.yml --enable_micro
./micro/tools/cmake/cmake-build-gcc-arm-none-eabi.sh -DARM_CPU=cortex-m7 -DMACE_MICRO_ENABLE_CMSIS=ON -DMACE_MICRO_ENABLE_HARDFP=OFF
The "-DARM_CPU=cortex-{m7|m4|..}" is a necessary CMake variable for different series of Arm MCUs.
You can use the MACE Micro install package ("build/micro/gcc-arm-none-eabi/install") in your own project. Here we use "mbed-cli" to compile it
.. code-block:: sh
# cp the MACE Micro libraries to the workspace directory
cp build/micro/gcc-arm-none-eabi/install micro/examples/classifier -r
cd micro/examples/classifier
# Compile the program
mbed compile -t GCC_ARM -m NUCLEO_F767ZI -D MICRO_MODEL_NAME=mnist_int8 -D MICRO_DATA_NAME=mnist
# Flash the program to the development board
cp BUILD/NUCLEO_F767ZI/GCC_ARM/classifier.bin /media/$USER/NODE_F767ZI
# Connect to the default COM port
sudo chown $USER:$USER /dev/ttyACM0
mbed sterm
Press the reset(black) button to run the example again.
For Hexagon DSP
---------------
In the micro/cmake/toolchain folder, there are two Hexagon CMake toolchain files for reference. For more details, please go to <https://developer.qualcomm.com/software/hexagon-dsp-sdk/dsp-processor/>
\ No newline at end of file
Operator lists
===============
Float32 and bfloat16 operators
* batch_norm
* conv_2d
* depthwise_conv_2d
* pooling
* activation
* argmax
* bias_add
* cast
* concat
* eltwise
* expand_dims
* matmul
* reduce
* reshape
* softmax
* squeeze
* stack
* stride_slice
Int8 operators
* conv_2d
* depthwise_conv_2d
* eltwise
* mat_mul
* pooling
* softmax
* quantize
* dequantize
set(MACE_PROTO_PROTOS mace.proto)
set(MACE_PROTO_SRCS)
set(MACE_PROTO_HDRS)
set(MACE_PROTO_PYTHON_DIR ${PROJECT_SOURCE_DIR}/tools/python/py_proto)
foreach(proto_file ${MACE_PROTO_PROTOS})
macro(generate_proto proto_file)
get_filename_component(proto_file_abs ${proto_file} ABSOLUTE)
get_filename_component(basename ${proto_file} NAME_WE)
set(PROTO_GENERATED_FILES ${basename}.pb.cc ${basename}.pb.h)
list(APPEND MACE_PROTO_SRCS ${basename}.pb.cc)
list(APPEND MACE_PROTO_HDRS ${basename}.pb.h)
set(${basename}_proto_files ${basename}.pb.cc ${basename}.pb.h)
set(${basename}_proto_srcs ${basename}.pb.cc)
add_custom_command(
OUTPUT ${PROTO_GENERATED_FILES}
OUTPUT ${basename}_proto_files
COMMAND ${PROTOC_BIN} --cpp_out ${CMAKE_CURRENT_BINARY_DIR} -I ${CMAKE_CURRENT_SOURCE_DIR} ${proto_file_abs}
COMMENT "Generating ${PROTO_GENERATED_FILES} from ${proto_file}"
COMMENT "Generating ${basename}_proto_files from ${proto_file}"
DEPENDS protoc_bin
VERBATIM
)
set(PROTO_GENERATED_PY_FILES ${MACE_PROTO_PYTHON_DIR}/${basename}_pb2.py)
set(PROTO_PYTHON_DIR ${PROJECT_SOURCE_DIR}/tools/python/py_proto)
set(PROTO_GENERATED_PY_FILES ${PROTO_PYTHON_DIR}/${basename}_pb2.py)
add_custom_command(
OUTPUT ${PROTO_GENERATED_PY_FILES}
COMMAND ${PROTOC_BIN} --python_out ${MACE_PROTO_PYTHON_DIR} -I ${CMAKE_CURRENT_SOURCE_DIR} ${proto_file_abs}
COMMAND ${PROTOC_BIN} --python_out ${PROTO_PYTHON_DIR} -I ${CMAKE_CURRENT_SOURCE_DIR} ${proto_file_abs}
COMMENT "Generating ${PROTO_GENERATED_PY_FILES} from ${proto_file}"
DEPENDS protoc_bin
VERBATIM
)
endforeach()
add_custom_target(mace_proto_src DEPENDS ${PROTO_GENERATED_FILES}
add_custom_target(${basename}_proto_cpp DEPENDS ${basename}_proto_files
COMMENT "Checking if re-generation is required")
add_custom_target(mace_proto_py ALL DEPENDS ${PROTO_GENERATED_PY_FILES})
add_custom_target(${basename}_proto_py ALL DEPENDS ${PROTO_GENERATED_PY_FILES})
endmacro()
generate_proto(mace.proto)
generate_proto(micro_mem.proto)
add_library(proto ${MACE_PROTO_SRCS})
add_library(proto ${mace_proto_srcs})
add_dependencies(proto mace_proto_cpp)
set_source_files_properties(
${mace_proto_srcs}
PROPERTIES GENERATED TRUE
)
target_link_libraries(proto libprotobuf_lite)
install(TARGETS proto ARCHIVE DESTINATION lib)
......@@ -16,6 +16,7 @@ enum DataType {
DT_FLOAT16 = 5;
DT_BFLOAT16 = 6;
DT_INT16 = 7;
DT_INT8 = 8;
}
enum MemoryType {
......
build
test/**/codegen
before_script:
- git submodule deinit -f .
- git submodule sync
- git submodule update --init .
stages:
- convert
- build
......@@ -6,23 +11,18 @@ stages:
model-convert:
stage: convert
script:
- rm -rf mace-models
- GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
- >
- CONF_FILE=mace-models/micro-models/har-cnn/har-cnn.yml
- python tools/python/convert.py --config=${CONF_FILE} --enable_micro
- bash micro/tools/ci/model_convert.sh
artifacts:
paths:
- mace-models
untracked: true
host-build:
cross-build:
stage: build
script:
- cd micro && ./tools/cmake/cmake-build-host.sh -DMICRO_MODEL_NAME=har_cnn
- bash micro/tools/ci/cross_build.sh
- bash micro/tools/ci/host_build_and_run_examples.sh
- bash micro/tools/ci/host_build_and_run_tests.sh
# The mbed-cli protobuf version conflicts with others
# - bash micro/tools/ci/build_mbed_example.sh
host-test:
stage: test
script:
- CONF_FILE=mace-models/micro-models/har-cnn/har-cnn.yml
- python tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name har_cnn
cmake_minimum_required(VERSION 3.7 FATAL_ERROR)
message("CMAKE_VERSION: ${CMAKE_VERSION}")
project(micro C CXX)
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
project(micro C CXX ASM)
# CMSIS_5 requires C99
set(CMAKE_C_STANDARD 99)
add_compile_options("-Wall;-Wextra")
option(MACE_MICRO_ENABLE_CMSIS "Whether to enable cmsis driver" OFF)
option(MACE_MICRO_ENABLE_BFLOAT16 "Whether to enable bfloat16 support" OFF)
option(MACE_MICRO_ENABLE_TESTS "Whether to enable Mace Micro tests" OFF)
option(MACE_MICRO_ENABLE_EXAMPLES "Whether to enable Mace Micro examples" OFF)
if(MACE_MICRO_GCC_ARM)
include(cmake/config_gcc_arm.cmake)
endif()
#set CMAKE_BUILD_TYPE default value as Release
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release"
CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel."
FORCE)
endif()
if(MACE_MICRO_ENABLE_CMSIS)
function(compilerSpecificCompileOptions PROJECTNAME ROOT)
target_compile_options(${PROJECTNAME}
PRIVATE "-Wno-unused-parameter"
PRIVATE "-Wno-sign-compare"
PRIVATE "-Wno-strict-aliasing"
PRIVATE "-Wno-unused-variable"
)
endfunction()
set(ROOT ${CMAKE_CURRENT_SOURCE_DIR}/third_party/CMSIS_5)
include_directories(${ROOT}/CMSIS/Core/Include)
add_subdirectory(${ROOT}/CMSIS/DSP/Source EXCLUDE_FROM_ALL)
add_subdirectory(${ROOT}/CMSIS/NN/Source EXCLUDE_FROM_ALL)
target_include_directories(CMSISDSP INTERFACE ${ROOT}/CMSIS/Core/Include)
target_include_directories(CMSISNN INTERFACE ${ROOT}/CMSIS/Core/Include)
include_directories(third_party/CMSIS_5/CMSIS/Core/Include)
endif()
if(HEXAGON6)
# Does not work with "-O3"
......@@ -13,10 +53,7 @@ if(MACE_MICRO_ARM_NONE)
add_definitions(-DMACE_MICRO_ARM_NONE)
endif()
option(MACE_ENABLE_BFLOAT16 "Whether to enable bfloat16 support" OFF)
option(MACE_MICRO_ENABLE_TESTS "Whether to enable Mace Micro tests" ON)
if(MACE_ENABLE_BFLOAT16)
if(MACE_MICRO_ENABLE_BFLOAT16)
add_definitions(-DMACE_ENABLE_BFLOAT16)
endif()
......@@ -24,15 +61,12 @@ if(MACE_MICRO_NDEBUG)
add_definitions(-DMACE_MICRO_NDEBUG)
endif()
include(third_party/third_party.cmake)
add_subdirectory(include)
add_subdirectory(port)
add_subdirectory(base)
add_subdirectory(model)
add_subdirectory(framework)
add_subdirectory(ops)
add_subdirectory(codegen)
file(GLOB micro_base_srcs base/*.cc)
file(GLOB micro_codegen_models_srcs codegen/models/**/*.cc)
......@@ -41,15 +75,13 @@ file(GLOB micro_framework_srcs framework/*.cc)
file(GLOB micro_models_srcs model/*.cc)
file(GLOB micro_ops_nhwc_base_srcs ops/nhwc/base/*.cc)
file(GLOB micro_ops_nhwc_srcs ops/nhwc/*.cc)
file(GLOB micro_ops_nhwc_cmsis_nn_srcs ops/nhwc/cmsis_nn/*.cc)
file(GLOB micro_ops_srcs ops/*.cc)
file(GLOB micro_ops_utils_srcs ops/utils/*.cc)
file(GLOB micro_port_srcs port/*.cc)
# To build a single library
add_library(micro
list(APPEND micro_src
${micro_base_srcs}
${micro_codegen_models_srcs}
${micro_codegen_engines_srcs}
${micro_framework_srcs}
${micro_models_srcs}
${micro_ops_srcs}
......@@ -58,22 +90,56 @@ add_library(micro
${micro_ops_utils_srcs}
${micro_port_srcs}
)
target_include_directories(micro PUBLIC ..)
if(MACE_MICRO_ENABLE_CMSIS)
list(APPEND micro_src ${micro_ops_nhwc_cmsis_nn_srcs})
endif()
add_library(micro ${micro_src})
target_include_directories(micro PUBLIC .. PUBLIC include/public)
install(TARGETS micro
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib
RUNTIME DESTINATION bin
)
install(FILES include/public/micro.h DESTINATION include)
if(MACE_MICRO_ENABLE_CMSIS)
target_link_libraries(micro PRIVATE CMSISNN)
install(TARGETS
CMSISNNReshape
CMSISNNBasicMaths
CMSISNNConcatenation
CMSISNNFullyConnected
CMSISNNConvolutions
CMSISNNActivation
CMSISNNPooling
CMSISNNSoftmax
CMSISNNSupport
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib
RUNTIME DESTINATION bin
)
endif()
add_subdirectory(codegen)
if(HEXAGON OR HEXAGON_STUB)
include(cmake/find_hexagon_sdk.cmake)
endif()
if(NOT HEXAGON)
if(NOT HEXAGON AND MICRO_MODEL_NAME)
add_subdirectory(third_party/gflags EXCLUDE_FROM_ALL)
add_subdirectory(tools)
endif()
if(MACE_MICRO_ENABLE_TESTS)
add_subdirectory(third_party/googletest EXCLUDE_FROM_ALL)
add_subdirectory(test)
endif(MACE_MICRO_ENABLE_TESTS)
endif()
if(MACE_MICRO_ENABLE_EXAMPLES)
add_subdirectory(examples)
endif()
......@@ -30,7 +30,7 @@ const int32_t kInt8ValueBufferLength = 4;
const int32_t kFloatValueBufferLength = 21;
inline bool IsValidLogLevel(const LogLevel level) {
return level >= CLEAN && level < INVALID_MAX;
return level < INVALID_MAX;
}
char LogLevelToShortStr(LogLevel level) {
......
......@@ -18,6 +18,7 @@
#include <stdint.h>
#include "micro/include/public/micro.h"
#include "micro/include/port/define.h"
namespace micro {
......
......@@ -105,5 +105,16 @@ float log(float x) {
return ::log(x);
}
// Returns the larger of two values; when they compare equal, the first
// argument is returned (matching the original ternary's a-on-tie behavior).
template <typename T>
const T &max(const T &lhs, const T &rhs) {
  if (lhs < rhs) {
    return rhs;
  }
  return lhs;
}
// Returns the smaller of two values; when they compare equal, the second
// argument is returned (matching the original ternary's b-on-tie behavior).
template <typename T>
const T &min(const T &lhs, const T &rhs) {
  if (lhs < rhs) {
    return lhs;
  }
  return rhs;
}
} // namespace base
} // namespace micro
# Compile configuration for bare-metal Arm (arm-none-eabi) Cortex-M targets.
# ARM_CPU selects -mcpu and, for the cores listed below, the matching FPU.
if(NOT ARM_CPU)
  message(FATAL_ERROR "please set ARM_CPU, such as: -DARM_CPU=cortex-m4. We set -mcpu=${ARM_CPU}")
endif()

add_compile_options("-mcpu=${ARM_CPU};-mthumb")
# Put each function/data item in its own section so the linker can
# garbage-collect unused code.
add_compile_options("-ffunction-sections;-fdata-sections")

# floating-point ABI
option(MACE_MICRO_ENABLE_HARDFP "Whether to use hard float-point ABI" ON)
if(MACE_MICRO_ENABLE_HARDFP)
  add_compile_options("-mfloat-abi=hard")
else()
  add_compile_options("-mfloat-abi=softfp")
endif()

# FPU selection per core. The same -mfpu flag must be passed to both the
# compiler and the linker; cores not listed here get no explicit -mfpu.
if(ARM_CPU STREQUAL "cortex-m55" OR ARM_CPU STREQUAL "cortex-m7")
  set(MACE_MICRO_FPU "fpv5-d16")
elseif(ARM_CPU STREQUAL "cortex-m33")
  set(MACE_MICRO_FPU "fpv5-sp-d16")
elseif(ARM_CPU STREQUAL "cortex-m4")
  set(MACE_MICRO_FPU "fpv4-sp-d16")
endif()

if(MACE_MICRO_FPU)
  add_compile_options("-mfpu=${MACE_MICRO_FPU}")
  add_link_options("-mfpu=${MACE_MICRO_FPU}")
endif()
set(CMAKE_SYSTEM_PROCESSOR arm)
set(CMAKE_C_COMPILER "${GCC_ARM_ROOT}/arm-none-eabi-gcc")
set(CMAKE_CXX_COMPILER "${GCC_ARM_ROOT}/arm-none-eabi-g++")
set(CMAKE_AR "${GCC_ARM_ROOT}/arm-none-eabi-ar" CACHE FILEPATH "Archiver")
set(CMAKE_LINKER "${GCC_ARM_ROOT}/arm-none-eabi-ld")
set(CMAKE_EXE_LINKER_FLAGS "--specs=nosys.specs" CACHE INTERNAL "")
set(MACE_MICRO_ARM_NONE ON)
# CMake cross-compiling toolchain file for the GNU Arm Embedded (arm-none-eabi)
# toolchain. "Generic" means a bare-metal target with no operating system.
set(CMAKE_SYSTEM_NAME Generic)
set(CMAKE_SYSTEM_PROCESSOR arm)
# Search programs on the host, but headers/libraries/packages only inside the
# toolchain root, so host libraries never leak into the cross build.
set(CMAKE_FIND_ROOT_PATH "${GCC_ARM_ROOT}")
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
# Locate the cross tools on PATH; the .exe alternatives support Windows hosts.
find_program(CMAKE_C_COMPILER NAMES arm-none-eabi-gcc arm-none-eabi-gcc.exe)
find_program(CMAKE_CXX_COMPILER NAMES arm-none-eabi-g++ arm-none-eabi-g++.exe)
find_program(CMAKE_ASM_COMPILER NAMES arm-none-eabi-gcc arm-none-eabi-gcc.exe)
# gcc-ar (not plain ar) is required for LTO-aware static archives.
find_program(CMAKE_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
find_program(CMAKE_CXX_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
find_program(CMAKE_C_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
find_program(CMAKE_LINKER NAMES arm-none-eabi-g++ arm-none-eabi-g++.exe)
find_program(ELF2BIN NAMES arm-none-eabi-objcopy arm-none-eabi-objcopy.exe)
# Force compiler settings
# Skip CMake's try-compile sanity check: a bare-metal compiler cannot link a
# host test executable, so the check would fail spuriously.
SET(CMAKE_C_COMPILER_WORKS TRUE)
SET(CMAKE_CXX_COMPILER_WORKS TRUE)
# Flag consumed by the top-level CMakeLists to include cmake/config_gcc_arm.cmake.
set(MACE_MICRO_GCC_ARM ON)
file(GLOB_RECURSE generated_models_srcs models *.cc)
add_library(generated_models
${generated_models_srcs}
)
target_link_libraries(generated_models
micro_framework
micro_include
micro_model
micro_ops
)
file(GLOB_RECURSE micro_engine_srcs engines micro_engine_factory.cc)
add_library(micro_engine
${micro_engine_srcs}
)
target_link_libraries(micro_engine
micro_framework
micro_model
micro_ops
generated_models
)
file(GLOB_RECURSE micro_engine_c_srcs engines micro_engine_c_interface.cc)
add_library(micro_engine_c
# Use ".keep.cc" as a source file when there are no model source files in "models" directory
add_library(models
${generated_models_srcs}
${micro_engine_srcs}
${micro_engine_c_srcs}
)
target_link_libraries(micro_engine_c
micro_engine
target_link_libraries(models
micro
)
install(TARGETS models
ARCHIVE DESTINATION lib
)
\ No newline at end of file
add_subdirectory(classifier)
\ No newline at end of file
mbed-os
BUILD
install
mbed_app.json
__pycache__
mbed_settings.py
\ No newline at end of file
TARGET=NUCLEO_F767ZI
ROOT=.
# Builds one classifier example executable. The model and its sample input are
# chosen at configure time via -DMICRO_MODEL_NAME=... and -DMICRO_DATA_NAME=...
if(NOT MICRO_MODEL_NAME OR NOT MICRO_DATA_NAME)
message(FATAL_ERROR "MICRO_MODEL_NAME or MICRO_DATA_NAME is undefined")
endif()

# The executable is named after the model (e.g. "mnist").
add_executable(${MICRO_MODEL_NAME} main.cc)
# Generated model/data headers may trigger warnings; do not promote them to errors.
target_compile_options(${MICRO_MODEL_NAME} PRIVATE "-Wno-error")
# "micro" is the inference engine, "models" the generated model code.
target_link_libraries(${MICRO_MODEL_NAME} micro models)
# main.cc uses these macros to pick the engine namespace and the input sample.
target_compile_definitions(${MICRO_MODEL_NAME} PRIVATE "-DMICRO_MODEL_NAME=${MICRO_MODEL_NAME}")
target_compile_definitions(${MICRO_MODEL_NAME} PRIVATE "-DMICRO_DATA_NAME=${MICRO_DATA_NAME}")
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_EXAMPLES_CLASSIFIER_DATA_H_
#define MICRO_EXAMPLES_CLASSIFIER_DATA_H_

// Per-model sample inputs for the classifier example. Each namespace pairs a
// captured sample (defined in the corresponding data/*.h header) with its
// input dimensions, so main.cc can select one via the MICRO_DATA_NAME macro.
#include "data/har.h"
#include "data/kws.h"
#include "data/mnist.h"

#include <stdint.h>  // system header: use <...>, not "stdint.h"

// NOTE(review): the `input` pointers are non-const objects and therefore have
// external linkage in C++; if this header is ever included from more than one
// translation unit the definitions will collide — consider making them
// `static` or `const float *const`. Confirm single-TU usage before relying on this.
namespace mnist {
const float *input = data_mnist_4;
const int32_t input_dims[4] = {1, 28, 28, 1};
}  // namespace mnist

namespace har {
const float *input = data_har_standing;
const int32_t input_dims[4] = {1, 90, 3, 1};
}  // namespace har

namespace kws {
const float *input = data_kws_yes;
const int32_t input_dims[4] = {1, 98, 40, 1};
}  // namespace kws

#endif  // MICRO_EXAMPLES_CLASSIFIER_DATA_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_EXAMPLES_DATA_HAR_H_
#define MICRO_EXAMPLES_DATA_HAR_H_
static float data_har_jogging[270] = {
5.012288, 11.264028, 0.95342433, -0.6946377, 12.680544, 0.50395286,
4.903325, 10.882658, -0.08172209, -0.61291564, 18.496431, 3.0237172,
-1.1849703, 12.108489, 7.205164, 1.3756552, -2.4925237, -6.510526,
-0.61291564, 10.56939, 5.706926, -0.50395286, 13.947236, 7.0553403,
-8.430995, 11.413852, 5.134871, 0.95342433, 1.3756552, 1.6480621,
-8.19945, 19.57244, 2.7240696, 1.4165162, 5.7886477, 2.982856,
-1.879608, -2.982856, -0.29964766, -6.1291566, 6.851035, -8.158588,
5.829509, 18.0061, 8.539958, 6.2789803, 2.982856, 2.9147544,
-1.56634, 8.308413, -1.4573772, 3.5276701, 13.593107, 9.425281,
-2.0294318, -5.706926, -10.18802, 2.7649305, 10.337844, -9.724928,
3.568531, 13.6748295, 1.5390993, -0.50395286, 3.8681788, 3.718355,
-2.3018389, 1.6889231, 0.08172209, -3.568531, 19.57244, 6.510526,
-0.8036005, -3.2961242, -4.630918, 0.50395286, 10.841797, 13.525005,
5.706926, 15.595298, 6.1700177, -8.662541, 7.273266, 4.0180025,
-1.334794, 1.2258313, 2.3699405, -4.5900574, 19.57244, 4.7126403,
3.8681788, 3.759216, 0.84446156, -1.7978859, 1.5390993, 8.730643,
7.668256, 11.264028, -1.3075534, -2.3699405, 14.2877445, 8.281172,
2.7240696, 1.4573772, 0.88532263, -3.5957718, 18.659876, -0.6537767,
3.9499009, 4.140586, 3.990762, 0.46309182, -2.4108016, 2.4108016,
3.7864566, 14.137921, -3.1463003, 3.336985, 19.231932, 6.5513873,
5.6660647, 3.7864566, 0.53119355, 0.23154591, 0.7627395, 0.7627395,
-4.8216033, 19.57244, 8.158588, 1.8387469, -1.1168685, -2.7921712,
-3.2961242, 10.079058, 13.824653, 11.604536, 17.079916, 1.334794,
-3.173541, 14.015338, 5.706926, 0.61291564, 1.1168685, 2.5606253,
-7.8861814, 19.57244, 1.9885708, 3.1463003, 5.243834, 4.671779,
-3.0237172, -4.3312707, -3.336985, -0.08172209, 11.917805, -7.8861814,
-1.0351465, 14.818938, 4.6036777, -2.4516625, 2.5333846, 3.486809,
-1.3756552, 2.070293, -0.19068487, -2.4925237, 19.57244, 6.469665,
1.4573772, -5.243834, -4.372132, -1.4165162, 9.80665, 5.7477865,
-1.2666923, 14.709975, 6.2108784, -3.6774938, 3.173541, 3.7864566,
1.8387469, 2.7649305, -1.7570249, -1.2666923, 19.313654, 6.3198414,
2.4108016, -7.6546354, -6.1291566, -0.61291564, 16.358038, 4.944186,
0.040861044, 17.502148, 2.5333846, -7.6546354, 7.8180795, 4.372132,
-1.2666923, 0.7218784, 0.8036005, -5.012288, 19.57244, 5.5162406,
1.9477097, 2.7921712, 2.070293, -5.053149, 1.6480621, 7.6273947,
9.384419, 13.443283, 1.0351465, -5.434519, 13.211738, 6.4424243,
-0.61291564, 1.879608, 1.4165162, 4.7126403, -6.5513873, -6.0201936,
-1.7570249, 9.302697, -6.428804, -0.9125633, 10.501288, -0.27240697,
2.6014864, 19.381754, 4.440233, 5.7886477, 3.214402, 1.1441092,
-1.9885708, 12.4489975, -2.7240696, 1.4165162, 16.780268, 8.471856,
0.42223078, -8.267551, -7.3549876, -3.568531, 10.95076, -0.8036005,
-4.671779, 11.727119, 0.38136974, -2.1383946, 1.6889231, 3.5276701,
-1.334794, 2.4925237, -0.3405087, -2.9147544, 19.57244, 7.5865335,
3.5276701, -3.9499009, -1.920469, -4.0588636, 10.038197, 14.2877445};
// HAR (human activity recognition) sample window for the "walking" class:
// 270 floats = 90 time steps x 3 channels per step. The channels are
// presumably accelerometer x/y/z readings -- TODO confirm channel order
// against the data-collection pipeline.
static float data_har_walking[270] = {
-0.99, 11.45, -3.0645783, 1.18, 14.94, -3.718355,
1.27, 13.82, -1.2258313, -0.15, 11.14, -2.1111538,
-1.38, 8.05, -0.84446156, -1.99, 5.94, 0.14982383,
-0.08, 4.94, 0.88532263, -0.27, 4.14, 2.2609777,
-3.26, 6.44, 4.1814466, -5.75, 13.02, 7.273266,
-2.37, 10.65, 8.008764, -0.46, 15.94, 0.7218784,
1.8, 6.13, -1.1168685, -4.75, 10.84, -3.0645783,
-1.46, 8.39, 0.88532263, 1.33, 7.78, -0.46309182,
-3.72, 8.47, -0.7218784, -3.72, 8.47, -0.7218784,
-1.88, 7.63, -0.08172209, -1.12, 9.3, -0.10896278,
-2.37, 10.95, -0.8036005, -4.06, 12.3, -0.7627395,
-3.41, 14.52, -0.7218784, 0.34, 12.22, -3.7864566,
0.76, 15.32, -2.6014864, -0.04, 13.53, -1.1849703,
-0.53, 9.72, -2.1792557, 0.11, 5.52, -1.6480621,
0.38, 4.06, 0.46309182, 0.04, 3.26, 0.14982383,
-3.34, 5.83, 4.862464, -6.05, 13.14, 7.668256,
-0.91, 11.14, 11.073342, -0.5, 16.13, -0.9125633,
-0.27, 7.7, -1.1849703, -3.45, 9.28, -2.1383946,
-2.03, 9.04, -0.53119355, 2.03, 6.89, -0.5720546,
-2.18, 7.5, -1.3756552, -1.8, 7.21, -0.0,
-1.57, 9.96, 0.08172209, -3.21, 12.07, -0.14982383,
-5.09, 12.22, -0.7627395, -2.68, 14.98, -3.173541,
1.99, 12.79, -3.2961242, 0.84, 14.82, -2.2609777,
0.69, 13.21, -2.2609777, -1.08, 9.15, -1.2258313,
-0.95, 4.9, -0.7627395, -0.11, 4.67, 0.19068487,
0.61, 3.49, 0.08172209, -1.84, 5.48, 5.134871,
-5.6, 14.06, 7.3958488, -1.08, 12.03, 8.308413,
1.73, 14.56, 2.9147544, -0.76, 5.94, -5.325556,
-5.6, 12.83, -0.0, 0.04, 6.66, -0.9942854,
1.65, 7.89, -0.6537767, -2.3, 7.93, -2.3426998,
-1.92, 8.24, -0.040861044, -1.42, 9.96, -0.14982383,
-3.72, 11.5, 0.14982383, -4.59, 12.18, -0.5720546,
-2.79, 14.25, -3.2961242, 3.15, 13.02, -3.1054392,
1.46, 14.94, -2.2201166, -2.22, 12.49, -2.1111538,
-1.42, 9.53, -1.607201, -0.11, 6.17, -0.8036005,
0.34, 4.71, 0.10896278, 1.04, 3.49, 0.53119355,
-1.99, 5.05, 3.255263, -6.66, 14.29, 7.082581,
-3.87, 10.04, 9.765789, -1.5, 18.39, -0.6946377,
2.37, 5.01, -0.5720546, -5.24, 10.76, -3.173541,
-1.46, 8.2, 0.53119355, 2.6, 6.97, -0.040861044,
-3.53, 8.85, -1.879608, -1.23, 7.06, -0.23154591,
-1.53, 11.3, 0.23154591, -2.53, 11.65, -0.6946377,
-3.83, 12.34, -0.50395286, -2.96, 13.25, -3.173541,
2.83, 13.25, -3.173541, 0.65, 14.41, -1.1441092,
-0.89, 11.8, -2.6014864, -1.18, 7.21, -1.334794};
// HAR sample window for the "standing" class: 270 floats = 90 time steps x
// 3 channels per step (same layout as data_har_walking). Note the nearly
// constant values of the first ~23 rows, consistent with a static posture.
static float data_har_standing[270] = {
3.17, 9.28, 1.1441092, 3.3, 9.23, 1.1168685,
3.21, 9.3, 1.1849703, 3.17, 9.28, 1.0760075,
3.17, 9.34, 1.1168685, 3.26, 9.28, 1.1168685,
3.21, 9.3, 1.1168685, 3.21, 9.23, 1.1168685,
3.17, 9.28, 1.1168685, 3.15, 9.28, 1.1849703,
3.17, 9.34, 1.1168685, 3.21, 9.28, 1.1849703,
3.21, 9.3, 1.0760075, 3.15, 9.34, 1.1168685,
3.21, 9.28, 1.0760075, 3.21, 9.34, 1.1441092,
3.26, 9.3, 1.1441092, 3.17, 9.34, 1.1168685,
3.21, 9.3, 1.1168685, 3.21, 9.28, 1.1168685,
3.26, 9.28, 1.1849703, 3.17, 9.3, 1.1168685,
3.21, 9.28, 1.1168685, -1.88, 9.85, -0.23154591,
-0.19, 9.92, -0.5720546, -0.61, 10.27, -0.88532263,
0.76, 10.57, -1.7570249, 0.42, 9.47, -1.1168685,
0.38, 9.47, -1.9477097, -1.04, 10.65, -1.525479,
-1.92, 9.51, -0.5720546, -1.31, 9.85, -0.53119355,
-0.08, 9.92, -1.7570249, 1.73, 9.77, -0.8036005,
1.5, 9.92, -1.4573772, 1.27, 10.5, -1.879608,
0.61, 10.12, -1.9885708, -0.53, 9.77, -1.879608,
-0.42, 9.62, -1.6480621, 0.65, 10.42, -2.2201166,
0.65, 10.42, -2.2201166, 1.61, 9.38, -1.8387469,
1.61, 9.43, -1.525479, 1.61, 9.43, -1.525479,
0.95, 10.27, -1.3075534, 0.19, 10.38, -1.1849703,
0.31, 9.81, -1.4165162, 1.12, 9.62, -1.6889231,
1.23, 9.85, -1.6480621, 1.04, 9.7, -1.8387469,
0.57, 9.89, -2.0294318, 0.65, 9.96, -1.9885708,
0.95, 9.96, -1.7570249, 1.42, 10, -1.7297841,
1.69, 9.89, -1.525479, 1.46, 10, -1.4165162,
0.69, 9.77, -1.6889231, 0.08, 9.96, -1.9477097,
-0.08, 10.19, -2.1111538, 0.38, 9.72, -1.9885708,
0.93, 10.12, -2.1111538, 1.33, 9.62, -1.9885708,
1.08, 9.85, -1.9477097, 0.8, 9.77, -1.7570249,
0.69, 10.34, -1.6889231, 0.72, 9.66, -1.3075534,
0.69, 10, -1.3756552, 0.93, 9.62, -1.4573772,
0.76, 10.12, -1.607201, 0.93, 9.72, -1.7978859,
0.76, 10.23, -1.9885708, 0.76, 9.23, -1.920469,
0.57, 10.34, -2.1383946, 0.99, 9.58, -1.879608,
1.33, 10.04, -1.7978859, 1.61, 9.85, -1.4165162,
0.61, 10.15, -0.88532263, 0.53, 9.58, -1.4573772,
0.15, 10.19, -1.920469, 0.34, 9.85, -1.334794,
0.8, 10.31, -1.7978859, 0.69, 9.53, -1.9477097,
0.8, 9.92, -1.879608, 0.5, 10.04, -1.1849703,
1.12, 9.43, -1.7978859, 1.31, 10.27, -1.2666923,
1.5, 9.77, -1.607201, 0.46, 10.04, -0.9125633,
0.31, 9.85, -1.0760075, 0.61, 10.19, -1.1849703};
#endif // MICRO_EXAMPLES_DATA_HAR_H_
This diff has been collapsed.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_EXAMPLES_DATA_MNIST_H_
#define MICRO_EXAMPLES_DATA_MNIST_H_
// clang-format off
// A single 28x28 MNIST sample, row-major (28 values per source line),
// pixel intensities normalized to [0, 1]. The variable name indicates it
// depicts the digit "4".
static float data_mnist_4[28*28] = {
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.4,0.1,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.8,0.4,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.2,1. ,0.2,0. ,0. ,0. ,0. ,0. ,0. ,0.2,1. ,0.5,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.6,1. ,0.2,0. ,0. ,0. ,0. ,0. ,0. ,0.6,1. ,0.9,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.7,1. ,0.2,0. ,0. ,0. ,0. ,0. ,0.1,0.8,1. ,0.6,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,1. ,1. ,0.2,0. ,0. ,0. ,0. ,0. ,0.1,1. ,1. ,0.5,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,1. ,1. ,0.2,0. ,0. ,0. ,0. ,0. ,0.2,1. ,0.7,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.3,1. ,0.8,0.1,0. ,0. ,0. ,0. ,0. ,0.6,1. ,0.2,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.4,1. ,0.8,0. ,0. ,0. ,0. ,0. ,0. ,0.7,1. ,0.2,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.4,1. ,0.8,0.1,0. ,0. ,0. ,0. ,0.2,1. ,1. ,0.2,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.3,1. ,1. ,0.9,0.7,0.5,0.6,0.2,0.6,1. ,1. ,0.6,0.2,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.5,1. ,1. ,1. ,1. ,1. ,1. ,1. ,1. ,1. ,1. ,0.8,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.1,0.5,0.5,0.5,0.5,0.5,0.9,1. ,0.6,0.1,0.1,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.9,0.9,0.1,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.2,1. ,0.9,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.2,0.9,1. ,0.3,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.3,1. ,0.9,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.8,1. ,0.2,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.1,0.8,1. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.2,1. ,1. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.1,0.9,1. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.,
0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0. ,0.
};
// clang-format on
#endif // MICRO_EXAMPLES_DATA_MNIST_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstdio>
#include "data.h"
#include "micro.h"
namespace micro {
namespace MICRO_MODEL_NAME {
MaceStatus GetMicroEngineSingleton(MaceMicroEngine **engine);
}
} // namespace micro
int main() {
  // Example driver: fetch the generated singleton engine, feed it the
  // compiled-in sample input, run inference, and print the first output.
  micro::MaceMicroEngine *micro_engine = NULL;
  micro::MICRO_MODEL_NAME::GetMicroEngineSingleton(&micro_engine);
  // Fix: guard against a failed engine creation before dereferencing.
  if (micro_engine == NULL) {
    printf("Failed to get the MACE Micro engine singleton\n");
    return -1;
  }
  micro_engine->RegisterInputData(0, MICRO_DATA_NAME::input,
                                  MICRO_DATA_NAME::input_dims);
  micro_engine->Run();
  float *output_buffer = NULL;
  const int32_t *output_dims = NULL;
  uint32_t dim_size = 0;
  micro_engine->GetOutputData(0, reinterpret_cast<void **>(&output_buffer),
                              &output_dims, &dim_size);
  // Total element count = product of output dimensions.
  // Fix: loop index is uint32_t to match dim_size (was int32_t, a
  // signed/unsigned comparison).
  int32_t output_total_size = 1;
  for (uint32_t i = 0; i < dim_size; ++i) {
    output_total_size *= output_dims[i];
  }
  for (int32_t i = 0; i < output_total_size; ++i) {
    printf("%d: %f\n", i, output_buffer[i]);
  }
  return 0;
}
https://github.com/ARMmbed/mbed-os/#0db72d0cf26539016efbe38f80d6f2cb7a3d4414
......@@ -201,6 +201,39 @@ MaceStatus Operator::ResizeOutputShape(uint32_t idx, uint32_t dim_size,
return MACE_SUCCESS;
}
// Looks up the quantization parameters (scale / zero point) of the idx-th
// input. They live either on the const tensor backing the input, or on the
// quantize_info entry of the upstream operator that produces it. Model
// inputs carry no quantize info and trigger an assertion.
QuantizeInfo Operator::GetInputQuantizeInfo(uint32_t idx) {
  const OpIOInfo *input_info = op_context_->input_info(idx);
  const uint32_t op_def_idx = input_info->op_def_idx_;
  if (kIdxConstTensor == op_def_idx) {
    // Constant-tensor input: parameters are stored on the tensor itself.
    const model::ConstTensor *const_tensor =
        engine_config_->net_def_->tensor(input_info->output_idx_);
    QuantizeInfo quantize_info;
    quantize_info.scale = const_tensor->scale();
    quantize_info.zero = const_tensor->zero_point();
    return quantize_info;
  } else if (kIdxModelInput == op_def_idx) {
    MACE_ASSERT1(false, "Unexpected, the model input has no quantize info");
  } else {
    // Produced by another op: read that op's output quantize info.
    const model::OperatorDef *pre_op_def =
        engine_config_->net_def_->op(op_def_idx);
    model::QuantizeActivationInfo quantize_activation_info =
        pre_op_def->quantize_info(input_info->output_idx_);
    QuantizeInfo quantize_info;
    quantize_info.scale = quantize_activation_info.scale();
    quantize_info.zero = quantize_activation_info.zero_point();
    return quantize_info;
  }
  // Fix: the model-input branch above previously fell off the end of this
  // non-void function (undefined behavior when assertions are compiled
  // out). Return identity quantization parameters as a safe fallback.
  QuantizeInfo fallback;
  fallback.scale = 1.0f;
  fallback.zero = 0.0f;
  return fallback;
}
// Returns the quantization parameters of this operator's idx-th output;
// they are stored directly on the op's own definition.
QuantizeInfo Operator::GetOutputQuantizeInfo(uint32_t idx) {
  const model::QuantizeActivationInfo serialized_info =
      op_def_->quantize_info(idx);
  QuantizeInfo result;
  result.scale = serialized_info.scale();
  result.zero = serialized_info.zero_point();
  return result;
}
#ifndef MACE_DEFINE_GET_ARG_BY_NAME_FUNC
#define MACE_DEFINE_GET_ARG_BY_NAME_FUNC(T, FUNC) \
template <> \
......
......@@ -17,13 +17,18 @@
#include "micro/base/logging.h"
#include "micro/base/types.h"
#include "micro/include/public/micro.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/include/public/micro.h"
namespace micro {
struct MaceMicroEngineConfig;
// Quantization parameters of a single tensor.
struct QuantizeInfo {
  float scale;  // quantization scale factor
  float zero;   // zero point; callers assign it from an int32 zero_point(),
                // so it is widened to float here -- TODO confirm intentional
};
namespace model {
class Argument;
class OperatorDef;
......@@ -84,6 +89,10 @@ class Operator {
const int32_t *input_dims);
MaceStatus ReuseInputBufferForOutput(uint32_t output_idx, uint32_t input_idx);
QuantizeInfo GetInputQuantizeInfo(uint32_t idx);
QuantizeInfo GetOutputQuantizeInfo(uint32_t idx);
template<typename T>
const T *GetInputData(uint32_t idx) {
return static_cast<const T *>(DoGetInputData(idx));
......@@ -101,7 +110,7 @@ class Operator {
const model::OperatorDef *op_def_;
MaceMicroEngineConfig *engine_config_;
private:
protected:
OpContext *op_context_;
};
......
......@@ -17,7 +17,9 @@
#include <stdint.h>
#include "micro/include/port/define.h"
#ifndef NULL
#define NULL 0
#endif
namespace micro {
......@@ -61,7 +63,7 @@ class Graph;
class Operator;
} // namespace framework
struct MACE_API MaceMicroEngineConfig {
struct MaceMicroEngineConfig {
model::NetDef *net_def_;
const uint8_t *model_data_;
framework::Graph *graph_;
......@@ -73,7 +75,7 @@ struct MACE_API MaceMicroEngineConfig {
uint32_t scratch_buffer_size_;
};
class MACE_API MaceMicroEngine {
class MaceMicroEngine {
public:
MaceMicroEngine() {}
~MaceMicroEngine() {}
......
......@@ -32,7 +32,14 @@ union Sphinx {
class BFloat16 {
public:
BFloat16();
BFloat16() {}
explicit BFloat16(float value) { data_ = Sphinx(value).i >> 16; }
explicit BFloat16(int value) {
data_ = Sphinx(static_cast<float>(value)).i >> 16;
}
operator float() const {
return Sphinx(static_cast<uint32_t>(data_ << 16)).f;
......
......@@ -23,11 +23,22 @@ MACE_DEFINE_STRING_FUNC(OperatorDef, name, name_)
MACE_DEFINE_STRING_FUNC(OperatorDef, type, type_)
MACE_DEFINE_OBJECT_FUNC(OperatorDef, int32_t, device_type)
MACE_DEFINE_PTR_ARRAY_FUNC(OperatorDef, Argument, arg, args_)
MACE_DEFINE_PTR_ARRAY_FUNC(OperatorDef, OutputShape,
output_shape, output_shapes_)
MACE_DEFINE_PTR_ARRAY_FUNC(OperatorDef,
OutputShape,
output_shape,
output_shapes_)
MACE_DEFINE_ARRAY_FUNC(OperatorDef, DataType, output_type, output_types_)
MACE_DEFINE_ARRAY_FUNC(OperatorDef,
QuantizeActivationInfo,
quantize_info,
quantize_info_);
// the mem_offset is the mem_id in proto file
MACE_DEFINE_ARRAY_FUNC(OperatorDef, int32_t, mem_offset, mem_offsets_)
MACE_DEFINE_OBJECT_FUNC(QuantizeActivationInfo, float, scale);
MACE_DEFINE_OBJECT_FUNC(QuantizeActivationInfo, int32_t, zero_point);
MACE_DEFINE_OBJECT_FUNC(QuantizeActivationInfo, float, minval);
MACE_DEFINE_OBJECT_FUNC(QuantizeActivationInfo, float, maxval);
} // namespace model
} // namespace micro
......@@ -23,6 +23,22 @@
namespace micro {
namespace model {
// Serialized per-output quantization parameters of an operator
// (scale / zero point plus the observed min/max activation range).
// Presumably mirrors the corresponding message in the proto definition --
// confirm field order against the converter.
class QuantizeActivationInfo {
 public:
  MACE_DEFINE_HARD_CODE_MAGIC(QuantizeActivationInfo)

  MACE_DECLARE_OBJECT_FUNC(float, scale);
  MACE_DECLARE_OBJECT_FUNC(int32_t, zero_point);
  MACE_DECLARE_OBJECT_FUNC(float, minval);
  MACE_DECLARE_OBJECT_FUNC(float, maxval);

 private:
  SerialFloat scale_;
  SerialInt32 zero_point_;
  SerialFloat minval_;
  SerialFloat maxval_;
};
class OperatorDef : public Serialize {
public:
MACE_DEFINE_HARD_CODE_MAGIC(OperatorDef)
......@@ -35,6 +51,7 @@ class OperatorDef : public Serialize {
MACE_DECLARE_PTR_ARRAY_FUNC(Argument, arg);
MACE_DECLARE_PTR_ARRAY_FUNC(OutputShape, output_shape);
MACE_DECLARE_ARRAY_FUNC(DataType, output_type);
MACE_DECLARE_ARRAY_FUNC(QuantizeActivationInfo, quantize_info);
// the mem_offset is the mem_id in proto file
MACE_DECLARE_ARRAY_FUNC(int32_t, mem_offset);
......@@ -48,6 +65,7 @@ class OperatorDef : public Serialize {
SerialArray<Argument> args_;
SerialArray<OutputShape> output_shapes_;
SerialArray<DataType> output_types_;
SerialArray<QuantizeActivationInfo> quantize_info_;
SerialArray<SerialInt32> mem_offsets_;
};
......
set(MICRO_OPS_SRCS
shape.cc
reduce.cc
reshape.cc
matmul.cc
nhwc/depthwise_conv_2d_ref.cc
nhwc/conv_2d_c4_s4.cc
......@@ -31,6 +30,8 @@ set(MICRO_OPS_SRCS
activation.cc
)
add_subdirectory(nhwc)
add_library(micro_ops
${MICRO_OPS_SRCS}
)
......
......@@ -40,10 +40,8 @@ class MatMulOp : public framework::Operator {
uint32_t input_b_dim_size_;
const mifloat *bias_;
#ifndef MACE_MICRO_NDEBUG
const int32_t *bias_dims_;
uint32_t bias_dim_size_;
#endif
mifloat *output_;
......
# Pull in the CMSIS-NN backed kernels when the feature flag is on.
# NOTE(review): "CMISI" looks like a typo of "CMSIS". If the option is
# declared elsewhere as MACE_MICRO_ENABLE_CMSIS, this branch never fires --
# confirm against the top-level option() definition before renaming.
if(MACE_MICRO_ENABLE_CMISI)
  add_subdirectory(cmsis_nn)
endif()
# CMSIS-NN backed int8 kernels for the NHWC ops.
add_library(mace_micro_ops_nhwc_cmsis_nn
  arm_conv_2d_int8.cc
  arm_pooling_int8.cc
  arm_softmax_int8.cc
  arm_mat_mul_int8.cc
  arm_depthwise_conv_2d_int8.cc
  # Fix: arm_eltwise_int8.cc ships in this directory (same commit) but was
  # missing from the source list; without it ArmEltwiseInt8Op would be
  # undefined at link time if referenced.
  arm_eltwise_int8.cc
  dequantize.cc
  quantize.cc
  utilities.cc
)

target_link_libraries(mace_micro_ops_nhwc_cmsis_nn
  PUBLIC micro_base
  PUBLIC micro_framework
  PUBLIC micro_ops
  # CMSISNN supplies arm_nnfunctions.h and the arm_*_s8 kernels.
  PRIVATE CMSISNN
)
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/cmsis_nn/arm_conv_2d_int8.h"
#include <arm_nnfunctions.h>
#include "micro/base/logger.h"
#include "micro/framework/op_context.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/model/const_tensor.h"
#include "micro/model/net_def.h"
#include "micro/ops/nhwc/cmsis_nn/utilities.h"
namespace micro {
namespace ops {
// Runs the CMSIS-NN int8 convolution into the already-resized output.
// Per-tensor requantization: one effective scale
// (input_scale * filter_scale / output_scale) is converted to a fixed-point
// multiplier + shift and broadcast into CMSIS-NN's per-channel arrays.
MaceStatus ArmConv2dInt8Op::Compute(int32_t (&output_dims)[4]) {
// Sanity: out channels == filter N, in channels == filter C.
MACE_ASSERT(filter_dims_[0] == output_dims[3] &&
input_dims_[3] == filter_dims_[3]);

QuantizeInfo input_quantize_info = GetInputQuantizeInfo(INPUT);
QuantizeInfo filter_quantize_info = GetInputQuantizeInfo(FILTER);
QuantizeInfo output_quantize_info = GetOutputQuantizeInfo(OUTPUT);

double double_multiplier = input_quantize_info.scale *
filter_quantize_info.scale /
output_quantize_info.scale;
int32_t multiplier;
int32_t shift;
QuantizeMultiplier(double_multiplier, &multiplier, &shift);

cmsis_nn_conv_params conv_params;
/// input_offset is negative
conv_params.input_offset = -input_quantize_info.zero;
conv_params.output_offset = output_quantize_info.zero;
// No fused activation: clamp to the full int8 range.
conv_params.activation.min = -128;
conv_params.activation.max = 127;
// NOTE(review): strides_[0]/padding_sizes_[0]/dilations_[0] feed the .w
// fields -- confirm these arrays are ordered [w, h]; MACE elsewhere uses
// [h, w]. Padding is total/2, i.e. assumed symmetric.
conv_params.stride.w = strides_[0];
conv_params.stride.h = strides_[1];
conv_params.padding.w = padding_sizes_[0] / 2;
conv_params.padding.h = padding_sizes_[1] / 2;
conv_params.dilation.w = dilations_[0];
conv_params.dilation.h = dilations_[1];

// The GetBuffer calls below carve sequential regions out of the shared
// engine scratch buffer; keep their order and sizes consistent.
ScratchBuffer scratch_buffer(engine_config_);

// Broadcast the single per-tensor multiplier/shift per output channel.
cmsis_nn_per_channel_quant_params quant_params;
quant_params.multiplier = scratch_buffer.GetBuffer<int32_t>(output_dims[3]);
quant_params.shift = scratch_buffer.GetBuffer<int32_t>(output_dims[3]);
for (int32_t i = 0; i < output_dims[3]; ++i) {
quant_params.multiplier[i] = multiplier;
quant_params.shift[i] = shift;
}

cmsis_nn_dims input_dims;
input_dims.n = input_dims_[0];
input_dims.h = input_dims_[1];
input_dims.w = input_dims_[2];
input_dims.c = input_dims_[3];
const int8_t *input_data = reinterpret_cast<const int8_t *>(input_);

cmsis_nn_dims filter_dims;
filter_dims.n = filter_dims_[0];
filter_dims.h = filter_dims_[1];
filter_dims.w = filter_dims_[2];
filter_dims.c = filter_dims_[3];
const int8_t *filter_data = reinterpret_cast<const int8_t *>(filter_);

// CMSIS-NN requires a bias array; synthesize an all-zero one when the op
// has no bias input.
cmsis_nn_dims bias_dims;
bias_dims.n = 1;
bias_dims.h = 1;
bias_dims.w = 1;
bias_dims.c = output_dims[3];
int32_t *bias_data =
const_cast<int32_t *>(reinterpret_cast<const int32_t *>(bias_));
if (bias_data == NULL) {
bias_data = scratch_buffer.GetBuffer<int32_t>(output_dims[3]);
for (int32_t i = 0; i < bias_dims.c; ++i) {
bias_data[i] = 0;
}
}

cmsis_nn_dims cmn_output_dims;
cmn_output_dims.n = output_dims[0];
cmn_output_dims.h = output_dims[1];
cmn_output_dims.w = output_dims[2];
cmn_output_dims.c = output_dims[3];
int8_t *output_data = reinterpret_cast<int8_t *>(output_);

// Ask CMSIS-NN for the temp-buffer size of the kernel it will select.
cmsis_nn_context cmn_context;
cmn_context.size = arm_convolve_wrapper_s8_get_buffer_size(
&conv_params, &input_dims, &filter_dims, &cmn_output_dims);
if (cmn_context.size > 0) {
cmn_context.buf = scratch_buffer.GetBuffer<int8_t>(cmn_context.size);
} else {
cmn_context.buf = NULL;
}

arm_status status = arm_convolve_wrapper_s8(
&cmn_context, &conv_params, &quant_params, &input_dims, input_data,
&filter_dims, filter_data, &bias_dims, bias_data, &cmn_output_dims,
output_data);
MACE_ASSERT(status == ARM_MATH_SUCCESS)
<< "failed in arm_convolve_wrapper_s8";

return MACE_SUCCESS;
}
// Entry point: infer padding and output geometry, resize output 0, then
// dispatch to the CMSIS-NN kernel.
MaceStatus ArmConv2dInt8Op::Run() {
  int32_t out_shape[4] = {0, 0, 0, 0};
  InitPaddingAndOutputSize(input_dims_, filter_dims_, FLOOR, out_shape);
  ResizeOutputShape(0, 4, out_shape);
  MACE_RETURN_IF_ERROR(Compute(out_shape));
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CMSIS_NN_ARM_CONV_2D_INT8_H_
#define MICRO_OPS_NHWC_CMSIS_NN_ARM_CONV_2D_INT8_H_
#include "micro/ops/nhwc/base/conv_2d_base.h"
#include "micro/ops/utils/activation.h"
namespace micro {
namespace ops {
// int8 NHWC Conv2d operator backed by CMSIS-NN
// (arm_convolve_wrapper_s8).
class ArmConv2dInt8Op : public Conv2dBase {
 public:
  virtual MaceStatus Run();

 private:
  // Runs the kernel for the already-resized N/H/W/C output shape.
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CMSIS_NN_ARM_CONV_2D_INT8_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/cmsis_nn/arm_depthwise_conv_2d_int8.h"
#include <arm_nnfunctions.h>
#include "micro/base/logger.h"
#include "micro/framework/op_context.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/model/const_tensor.h"
#include "micro/model/net_def.h"
#include "micro/ops/nhwc/cmsis_nn/utilities.h"
namespace micro {
namespace ops {
// Runs the CMSIS-NN int8 depthwise convolution into the already-resized
// output. Per-tensor requantization: one effective scale
// (input_scale * filter_scale / output_scale) becomes a fixed-point
// multiplier + shift, broadcast into CMSIS-NN's per-channel arrays.
MaceStatus ArmDepthwiseConv2dInt8Op::Compute(int32_t (&output_dims)[4]) {
  QuantizeInfo input_quantize_info = GetInputQuantizeInfo(INPUT);
  QuantizeInfo filter_quantize_info = GetInputQuantizeInfo(FILTER);
  QuantizeInfo output_quantize_info = GetOutputQuantizeInfo(OUTPUT);
  double double_multiplier = input_quantize_info.scale *
                             filter_quantize_info.scale /
                             output_quantize_info.scale;
  int32_t multiplier;
  int32_t shift;
  QuantizeMultiplier(double_multiplier, &multiplier, &shift);

  cmsis_nn_dw_conv_params dw_conv_params;
  // Channel multiplier: output channels produced per input channel.
  dw_conv_params.ch_mult = filter_dims_[0];
  /// input_offset is negative
  dw_conv_params.input_offset = -input_quantize_info.zero;
  dw_conv_params.output_offset = output_quantize_info.zero;
  // No fused activation: clamp to the full int8 range.
  dw_conv_params.activation.min = -128;
  dw_conv_params.activation.max = 127;
  // NOTE(review): index 0 feeds the .w fields -- confirm strides_/
  // padding_sizes_/dilations_ are ordered [w, h]. Padding is total/2,
  // i.e. assumed symmetric.
  dw_conv_params.stride.w = strides_[0];
  dw_conv_params.stride.h = strides_[1];
  dw_conv_params.padding.w = padding_sizes_[0] / 2;
  dw_conv_params.padding.h = padding_sizes_[1] / 2;
  dw_conv_params.dilation.w = dilations_[0];
  dw_conv_params.dilation.h = dilations_[1];

  // Sequential allocations out of the shared engine scratch buffer; keep
  // the GetBuffer order and sizes consistent.
  ScratchBuffer scratch_buffer(engine_config_);

  // Broadcast the per-tensor multiplier/shift per output channel.
  cmsis_nn_per_channel_quant_params quant_params;
  quant_params.multiplier = scratch_buffer.GetBuffer<int32_t>(output_dims[3]);
  quant_params.shift = scratch_buffer.GetBuffer<int32_t>(output_dims[3]);
  for (int32_t i = 0; i < output_dims[3]; ++i) {
    quant_params.multiplier[i] = multiplier;
    quant_params.shift[i] = shift;
  }

  cmsis_nn_dims input_dims;
  input_dims.n = input_dims_[0];
  input_dims.h = input_dims_[1];
  input_dims.w = input_dims_[2];
  input_dims.c = input_dims_[3];
  const int8_t *input_data = reinterpret_cast<const int8_t *>(input_);

  cmsis_nn_dims filter_dims;
  filter_dims.n = filter_dims_[0];
  filter_dims.h = filter_dims_[1];
  filter_dims.w = filter_dims_[2];
  filter_dims.c = filter_dims_[3];
  const int8_t *filter_data = reinterpret_cast<const int8_t *>(filter_);

  // CMSIS-NN requires a bias array; synthesize an all-zero one when the
  // op has no bias input.
  cmsis_nn_dims bias_dims;
  bias_dims.n = 1;
  bias_dims.h = 1;
  bias_dims.w = 1;
  bias_dims.c = output_dims[3];
  int32_t *bias_data =
      const_cast<int32_t *>(reinterpret_cast<const int32_t *>(bias_));
  if (bias_data == NULL) {
    bias_data = scratch_buffer.GetBuffer<int32_t>(output_dims[3]);
    for (int32_t i = 0; i < bias_dims.c; ++i) {
      bias_data[i] = 0;
    }
  }

  cmsis_nn_dims cmn_output_dims;
  cmn_output_dims.n = output_dims[0];
  cmn_output_dims.h = output_dims[1];
  cmn_output_dims.w = output_dims[2];
  // Depthwise: output channels = input channels * channel multiplier.
  cmn_output_dims.c = filter_dims.c * filter_dims.n;
  int8_t *output_data = reinterpret_cast<int8_t *>(output_);

  // Ask CMSIS-NN for the temp-buffer size of the kernel it will select.
  cmsis_nn_context cmn_context;
  cmn_context.size = arm_depthwise_conv_wrapper_s8_get_buffer_size(
      &dw_conv_params, &input_dims, &filter_dims, &cmn_output_dims);
  if (cmn_context.size > 0) {
    cmn_context.buf = scratch_buffer.GetBuffer<int8_t>(cmn_context.size);
  } else {
    cmn_context.buf = NULL;
  }

  arm_status status = arm_depthwise_conv_wrapper_s8(
      &cmn_context, &dw_conv_params, &quant_params, &input_dims, input_data,
      &filter_dims, filter_data, &bias_dims, bias_data, &cmn_output_dims,
      output_data);
  // Fix: the failure message previously said "arm_convolve_wrapper_s8"
  // (copy-pasted from the regular conv op); name the kernel actually
  // called here.
  MACE_ASSERT(status == ARM_MATH_SUCCESS)
      << "failed in arm_depthwise_conv_wrapper_s8";

  return MACE_SUCCESS;
}
// Entry point: infer the output geometry, scale the channel dimension by
// the input channel count (depthwise channel-multiplier layout), resize
// output 0, then dispatch to the CMSIS-NN kernel.
MaceStatus ArmDepthwiseConv2dInt8Op::Run() {
  int32_t out_shape[4] = {0, 0, 0, 0};
  InitPaddingAndOutputSize(input_dims_, filter_dims_, FLOOR, out_shape);
  out_shape[3] *= input_dims_[3];
  ResizeOutputShape(0, 4, out_shape);
  MACE_RETURN_IF_ERROR(Compute(out_shape));
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CMSIS_NN_ARM_DEPTHWISE_CONV_2D_INT8_H_
#define MICRO_OPS_NHWC_CMSIS_NN_ARM_DEPTHWISE_CONV_2D_INT8_H_
#include "micro/ops/nhwc/base/depthwise_conv_2d_base.h"
#include "micro/ops/utils/activation.h"
namespace micro {
namespace ops {
// int8 NHWC depthwise Conv2d operator backed by CMSIS-NN
// (arm_depthwise_conv_wrapper_s8).
class ArmDepthwiseConv2dInt8Op : public DepthwiseConv2dBase {
 public:
  virtual MaceStatus Run();

 private:
  // Runs the kernel for the already-resized N/H/W/C output shape.
  MaceStatus Compute(int32_t (&output_dims)[4]);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CMSIS_NN_ARM_DEPTHWISE_CONV_2D_INT8_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/cmsis_nn/arm_eltwise_int8.h"
#include <arm_nnfunctions.h>
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/ops/nhwc/cmsis_nn/utilities.h"
namespace micro {
namespace ops {
// One-time setup: cache input/output tensor pointers, shapes, and the op
// attributes (element-wise type and optional coefficients).
MaceStatus ArmEltwiseInt8Op::OnInit() {
  input0_ = GetInputData<int8_t>(INPUT0);
  input0_dims_ = GetInputShapeDims(INPUT0);
  input0_dim_size_ = GetInputShapeDimSize(INPUT0);

  // The second operand is optional; leave it NULL/empty when absent.
  const bool has_second_input = (GetInputSize() >= 2);
  input1_ = has_second_input ? GetInputData<int8_t>(INPUT1) : NULL;
  input1_dims_ = has_second_input ? GetInputShapeDims(INPUT1) : NULL;
  input1_dim_size_ = has_second_input ? GetInputShapeDimSize(INPUT1) : 0;

  output_ = GetOutputData<int8_t>(OUTPUT);
  type_ = static_cast<eltwise::Type>(
      GetArgByName("type", static_cast<int32_t>(NONE)));
  coeff_ = GetRepeatArgByName<float>("coeff", &coeff_size_);
  return MACE_SUCCESS;
}
// Executes the quantized element-wise op. Only binary SUM is implemented,
// via CMSIS-NN's arm_elementwise_add_s8.
MaceStatus ArmEltwiseInt8Op::Run() {
  MACE_ASSERT1(GetInputSize() < 3,
               "Element-Wise does not support 3 or higher inputs,"
               " you could change your model to multiple Element-Wise");
  // Fix: compare against the named enumerator rather than the magic
  // constant 0 (eltwise::SUM == 0, so behavior is unchanged).
  if (type_ == eltwise::SUM) {
    QuantizeInfo input_quantize_info0 = GetInputQuantizeInfo(0);
    QuantizeInfo input_quantize_info1 = GetInputQuantizeInfo(1);
    QuantizeInfo output_quantize_info = GetOutputQuantizeInfo(OUTPUT);

    // CMSIS-NN adds (value + offset); offsets are the negated zero points.
    int32_t input0_offset = -input_quantize_info0.zero;
    double input0_scale = input_quantize_info0.scale;
    int32_t input1_offset = -input_quantize_info1.zero;
    double input1_scale = input_quantize_info1.scale;
    int32_t output_offset = output_quantize_info.zero;
    double output_scale = output_quantize_info.scale;

    // Fixed-point rescaling: normalize both inputs by twice the larger
    // input scale, with a 2^20 headroom shift folded into the output
    // multiplier (appears to follow the TFLite quantized-add scheme).
    int32_t left_shift = 20;
    const double twice_max_input_scale =
        2 * static_cast<double>(base::max(input0_scale, input1_scale));
    const double real_input0_multiplier =
        static_cast<double>(input0_scale) / twice_max_input_scale;
    const double real_input1_multiplier =
        static_cast<double>(input1_scale) / twice_max_input_scale;
    const double real_output_multiplier =
        twice_max_input_scale /
        ((1 << left_shift) * static_cast<double>(output_scale));

    int32_t input0_multiplier = 0;
    int32_t input0_shift = 0;
    QuantizeMultiplier(real_input0_multiplier, &input0_multiplier,
                       &input0_shift);
    int32_t input1_multiplier = 0;
    int32_t input1_shift = 0;
    QuantizeMultiplier(real_input1_multiplier, &input1_multiplier,
                       &input1_shift);
    int32_t output_multiplier = 0;
    int32_t output_shift = 0;
    QuantizeMultiplier(real_output_multiplier, &output_multiplier,
                       &output_shift);

    // Element count comes from input0 only -- assumes both inputs share
    // the same shape (no broadcasting); TODO confirm upstream guarantees.
    int32_t element_size = base::GetShapeSize(input0_dim_size_, input0_dims_);
    arm_elementwise_add_s8(input0_, input1_, input0_offset, input0_multiplier,
                           input0_shift, input1_offset, input1_multiplier,
                           input1_shift, left_shift, output_, output_offset,
                           output_multiplier, output_shift, -128, 127,
                           element_size);
  } else {
    MACE_ASSERT1(false, "Unsupported ArmEltwiseInt8Op type");
  }
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CMSIS_NN_ARM_ELTWISE_INT8_H_
#define MICRO_OPS_NHWC_CMSIS_NN_ARM_ELTWISE_INT8_H_
#include "micro/base/logger.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/op_context.h"
#include "micro/framework/operator.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/model/const_tensor.h"
#include "micro/model/net_def.h"
namespace micro {
namespace ops {
namespace eltwise {  // for redefine
// Element-wise operation kinds. The numeric values are part of the
// serialized "type" argument contract -- do not reorder (confirm against
// the model converter). Only SUM is handled by ArmEltwiseInt8Op::Run().
enum Type {
  SUM = 0,
  SUB = 1,
  PROD = 2,
  DIV = 3,
  MIN = 4,
  MAX = 5,
  NEG = 6,
  ABS = 7,
  SQR_DIFF = 8,
  POW = 9,
  EQUAL = 10,
  FLOOR_DIV = 11,
  CLIP = 12,
  SIGN = 13,
  NONE = 14,
};
}  // namespace eltwise
// CMSIS-NN backed int8 element-wise operator (currently only SUM is
// implemented in Run()).
class ArmEltwiseInt8Op : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();

 private:
  const int8_t *input0_;        // first operand data
  const int32_t *input0_dims_;  // first operand shape
  uint32_t input0_dim_size_;

  const int8_t *input1_;        // second operand; NULL when only one input
  const int32_t *input1_dims_;
  uint32_t input1_dim_size_;

  int8_t *output_;
  eltwise::Type type_;          // element-wise kind from the "type" arg
  const float *coeff_;          // optional "coeff" repeated argument
  uint32_t coeff_size_;
  // NOTE(review): the two fields below are never assigned or read in this
  // file -- possibly carried over from the float eltwise op; confirm
  // before relying on them.
  int32_t scalar_input_index_;
  bool nchw_;

  MACE_OP_INPUT_TAGS(INPUT0, INPUT1);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CMSIS_NN_ARM_ELTWISE_INT8_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/cmsis_nn/arm_mat_mul_int8.h"
#include <arm_nnfunctions.h>
#include "micro/base/logger.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/op_context.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/model/argument.h"
#include "micro/model/const_tensor.h"
#include "micro/model/net_def.h"
#include "micro/ops/nhwc/cmsis_nn/utilities.h"
namespace micro {
namespace ops {
// Caches attribute values and input/output pointers once at init time so
// Run() does not have to query the framework on every invocation.
MaceStatus ArmMatMulInt8Op::OnInit() {
  transpose_a_ = GetArgByName("transpose_a", false);
  transpose_b_ = GetArgByName("transpose_b", false);
  input_a_ = GetInputData<int8_t>(INPUT_A);
  input_b_ = GetInputData<int8_t>(INPUT_B);
  output_ = GetOutputData<int8_t>(OUTPUT);
  // The int32 bias input is optional: present only when the op has >= 3
  // inputs; otherwise all bias fields are cleared.
  if (GetInputSize() >= 3) {
    bias_ = GetInputData<int32_t>(BIAS);
    bias_dim_size_ = GetInputShapeDimSize(BIAS);
    bias_dims_ = GetInputShapeDims(BIAS);
  } else {
    bias_ = NULL;
    bias_dim_size_ = 0;
    bias_dims_ = NULL;
  }
  input_a_dim_size_ = GetInputShapeDimSize(INPUT_A);
  input_b_dim_size_ = GetInputShapeDimSize(INPUT_B);
  input_a_dims_ = GetInputShapeDims(INPUT_A);
  input_b_dims_ = GetInputShapeDims(INPUT_B);
  return MACE_SUCCESS;
}
// Computes output = input_a * transpose(input_b) [+ bias] in int8 using
// CMSIS-NN's vector-by-transposed-matrix kernel, one lhs row at a time.
MaceStatus ArmMatMulInt8Op::Run() {
  MACE_ASSERT(Validate());
  MACE_ASSERT(input_a_dim_size_ == 2);
  MACE_ASSERT(input_b_dim_size_ == 2);
  // Only the A * B^T layout is supported by the CMSIS-NN kernel below.
  MACE_ASSERT(transpose_b_);
  MACE_ASSERT(!transpose_a_);
  const int32_t lhs_rows = input_a_dims_[0];
  const int32_t rhs_rows = input_b_dims_[0];
  const int32_t rhs_cols = input_b_dims_[1];
  const int32_t rhs_t_cols = rhs_rows;
  // Output shape is [lhs_rows, rhs_t_cols].
  const int32_t rows = lhs_rows;
  const int32_t cols = rhs_t_cols;
  if (bias_ != NULL) {
    MACE_ASSERT(bias_dim_size_ == 1);
    MACE_ASSERT(bias_dims_[0] == cols);
  }
  int32_t *output_dims0 =
      ScratchBuffer(engine_config_).GetBuffer<int32_t>(input_a_dim_size_);
  output_dims0[0] = rows;
  output_dims0[1] = cols;
  MACE_RETURN_IF_ERROR(
      ResizeOutputShape(OUTPUT, input_a_dim_size_, output_dims0));
  // Fold the input/output scales into one fixed-point multiplier + shift.
  QuantizeInfo input_quantize_info_a = GetInputQuantizeInfo(INPUT_A);
  QuantizeInfo input_quantize_info_b = GetInputQuantizeInfo(INPUT_B);
  QuantizeInfo output_quantize_info = GetOutputQuantizeInfo(OUTPUT);
  double double_multiplier = input_quantize_info_a.scale *
                             input_quantize_info_b.scale /
                             output_quantize_info.scale;
  int32_t multiplier;
  int32_t shift;
  QuantizeMultiplier(double_multiplier, &multiplier, &shift);
  // CMSIS-NN requires a bias pointer; synthesize a zero bias when absent.
  ScratchBuffer scratch_buffer(engine_config_);
  int32_t *bias = NULL;
  if (bias_ == NULL) {
    bias = scratch_buffer.GetBuffer<int32_t>(cols);
    for (int32_t i = 0; i < cols; ++i) {
      bias[i] = 0;
    }
  } else {
    bias = const_cast<int32_t *>(bias_);
  }
  // arm_nn_vec_mat_mult_t_s8 multiplies ONE lhs vector by rhs^T, so it must
  // be invoked once per lhs row. The previous code called it a single time,
  // which only produced the first output row whenever lhs_rows > 1.
  // Validate() guarantees the lhs depth equals rhs_cols, so rhs_cols is the
  // lhs row stride.
  for (int32_t r = 0; r < rows; ++r) {
    arm_status status = arm_nn_vec_mat_mult_t_s8(
        input_a_ + r * rhs_cols, input_b_, bias, output_ + r * cols,
        -input_quantize_info_a.zero, 0, output_quantize_info.zero, multiplier,
        shift, rhs_cols, rhs_rows, -128, 127);
    MACE_ASSERT(status == ARM_MATH_SUCCESS);
    (void)status;  // silence unused-variable warning when asserts compile out
  }
  return MACE_SUCCESS;
}
// Verifies that the two operands are compatible for matrix multiplication:
// ranks, batch dimensions, and the shared inner (reduction) dimension.
// Failures are reported through MACE_ASSERT1; a depth mismatch (and, now, an
// operand of rank < 2) also returns false.
bool ArmMatMulInt8Op::Validate() {
  const int32_t lhs_rank = input_a_dim_size_;
  const int32_t rhs_rank = input_b_dim_size_;
  // Guard the dims_[rank - 2] accesses below: a rank-0/1 operand would read
  // out of bounds (Run() additionally asserts rank == 2 afterwards).
  if (lhs_rank < 2 || rhs_rank < 2) {
    MACE_ASSERT1(false, "MatMul requires operands of rank >= 2");
    return false;
  }
  if (input_a_dim_size_ == input_b_dim_size_) {
    // Written as `i + 2 < size` instead of `i < size - 2`: the latter wraps
    // around for an unsigned size < 2 and would loop out of bounds.
    for (uint32_t i = 0; i + 2 < input_a_dim_size_; ++i) {
      MACE_ASSERT1(input_a_dims_[i] == input_b_dims_[i],
                   "batch dimensions are not equal");
    }
  } else {
    MACE_ASSERT1(input_a_dim_size_ == 2 || input_b_dim_size_ == 2,
                 "Either lhs or rhs matrix should has rank 2 "
                 "for non-batched matrix multiplication");
  }
  // The reduction dimensions must agree, taking transposes into account.
  int32_t lhs_depth =
      transpose_a_ ? input_a_dims_[lhs_rank - 2] : input_a_dims_[lhs_rank - 1];
  int32_t rhs_depth =
      transpose_b_ ? input_b_dims_[rhs_rank - 1] : input_b_dims_[rhs_rank - 2];
  if (lhs_depth != rhs_depth) {
    MACE_ASSERT1(false, "the number of A's column must be equal to B's row ");
    return false;
  }
  return true;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CMSIS_NN_ARM_MAT_MUL_INT8_H_
#define MICRO_OPS_NHWC_CMSIS_NN_ARM_MAT_MUL_INT8_H_
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Int8 matrix multiplication (A * B^T — Run() asserts transpose_b and
// !transpose_a) backed by CMSIS-NN, with an optional int32 bias as a third
// input.
class ArmMatMulInt8Op : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();

 private:
  // Checks operand ranks/batch dims and the shared inner dimension.
  bool Validate();

 private:
  const int8_t *input_a_;        // lhs data
  const int32_t *input_a_dims_;  // lhs shape
  uint32_t input_a_dim_size_;    // lhs rank
  const int8_t *input_b_;        // rhs data
  const int32_t *input_b_dims_;  // rhs shape
  uint32_t input_b_dim_size_;    // rhs rank
  const int32_t *bias_;          // optional bias; NULL when absent
  const int32_t *bias_dims_;
  uint32_t bias_dim_size_;
  int8_t *output_;
  bool transpose_a_;  // "transpose_a" arg; must be false (see Run)
  bool transpose_b_;  // "transpose_b" arg; must be true (see Run)

  MACE_OP_INPUT_TAGS(INPUT_A, INPUT_B, BIAS);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CMSIS_NN_ARM_MAT_MUL_INT8_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/cmsis_nn/arm_pooling_int8.h"
#include <arm_nnfunctions.h>
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
#include "micro/ops/nhwc/cmsis_nn/utilities.h"
#include "micro/include/utils/macros.h"
namespace micro {
namespace ops {
void ArmPoolingInt8Op::MaxPooling(const mifloat *input,
const int32_t *filter_hw,
const int32_t *stride_hw,
const int32_t *dilation_hw,
const int32_t *pad_hw) {
MACE_UNUSED(filter_hw);
MACE_UNUSED(dilation_hw);
cmsis_nn_context ctx;
ctx.buf = NULL;
ctx.size = 0;
cmsis_nn_pool_params pool_params;
pool_params.activation.min = -128;
pool_params.activation.max = 127;
pool_params.stride.h = stride_hw[0];
pool_params.stride.w = stride_hw[1];
pool_params.padding.h = pad_hw[0];
pool_params.padding.w = pad_hw[1];
cmsis_nn_dims input_dims;
input_dims.n = input_dims_[0];
input_dims.h = input_dims_[1];
input_dims.w = input_dims_[2];
input_dims.c = input_dims_[3];
const int8_t *input_data = reinterpret_cast<const int8_t *>(input);
cmsis_nn_dims filter_dims;
filter_dims.n = filter_dims_[0];
filter_dims.h = filter_dims_[1];
filter_dims.w = filter_dims_[2];
filter_dims.c = filter_dims_[3];
cmsis_nn_dims output_dims;
output_dims.n = output_dims_[0];
output_dims.h = output_dims_[1];
output_dims.w = output_dims_[2];
output_dims.c = output_dims_[3];
int8_t *output_data = reinterpret_cast<int8_t *>(output_);
arm_max_pool_s8(&ctx, &pool_params, &input_dims, input_data, &filter_dims,
&output_dims, output_data);
}
void ArmPoolingInt8Op::AvgPooling(const mifloat *input,
const int32_t *filter_hw,
const int32_t *stride_hw,
const int32_t *dilation_hw,
const int32_t *pad_hw) {
MACE_UNUSED(filter_hw);
MACE_UNUSED(dilation_hw);
const int32_t out_width = output_dims_[2];
const int32_t in_channels = input_dims_[3];
cmsis_nn_context ctx;
ctx.size = arm_avgpool_s8_get_buffer_size(out_width, in_channels);
MACE_ASSERT(ctx.size == 0);
ctx.buf = NULL;
cmsis_nn_pool_params pool_params;
pool_params.activation.min = -128;
pool_params.activation.max = 127;
pool_params.stride.h = stride_hw[0];
pool_params.stride.w = stride_hw[1];
pool_params.padding.h = pad_hw[0];
pool_params.padding.w = pad_hw[1];
cmsis_nn_dims input_dims;
input_dims.n = input_dims_[0];
input_dims.h = input_dims_[1];
input_dims.w = input_dims_[2];
input_dims.c = input_dims_[3];
const int8_t *input_data = reinterpret_cast<const int8_t *>(input);
cmsis_nn_dims filter_dims;
filter_dims.n = filter_dims_[0];
filter_dims.h = filter_dims_[1];
filter_dims.w = filter_dims_[2];
filter_dims.c = filter_dims_[3];
cmsis_nn_dims output_dims;
output_dims.n = output_dims_[0];
output_dims.h = output_dims_[1];
output_dims.w = output_dims_[2];
output_dims.c = output_dims_[3];
int8_t *output_data = reinterpret_cast<int8_t *>(output_);
arm_avgpool_s8(&ctx, &pool_params, &input_dims, input_data, &filter_dims,
&output_dims, output_data);
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CMSIS_NN_ARM_POOLING_INT8_H_
#define MICRO_OPS_NHWC_CMSIS_NN_ARM_POOLING_INT8_H_
#include "micro/model/output_shape.h"
#include "micro/ops/nhwc/base/pooling_base.h"
namespace micro {
namespace ops {
// Int8 max/average pooling implemented with CMSIS-NN kernels; provides the
// pooling hooks declared by PoolingBase.
class ArmPoolingInt8Op : public PoolingBase {
 private:
  // Runs arm_max_pool_s8. Kernel extents come from filter_dims_, so the
  // filter_hw/dilation_hw arguments are unused.
  void MaxPooling(const mifloat *input,
                  const int32_t *filter_hw,
                  const int32_t *stride_hw,
                  const int32_t *dilation_hw,
                  const int32_t *pad_hw);
  // Runs arm_avgpool_s8 with the same argument convention.
  void AvgPooling(const mifloat *input,
                  const int32_t *filter_hw,
                  const int32_t *stride_hw,
                  const int32_t *dilation_hw,
                  const int32_t *pad_hw);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CMSIS_NN_ARM_POOLING_INT8_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/cmsis_nn/arm_softmax_int8.h"
#include <arm_nnfunctions.h>
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/op_context.h"
#include "micro/model/net_def.h"
#include "micro/ops/nhwc/cmsis_nn/utilities.h"
namespace micro {
namespace ops {
// Caches the input/output pointers and the data-format attribute. The op
// only supports rank-2 inputs ([rows, classes]).
MaceStatus ArmSoftmaxInt8Op::OnInit() {
  // "data_format" defaults to NHWC when the model does not specify one.
  data_format_ = static_cast<DataFormat>(
      GetArgByName("data_format", static_cast<int32_t>(NHWC)));
  // The buffers hold int8 data behind mifloat pointers; RunForNHWC
  // reinterpret_casts them before calling CMSIS-NN.
  input_ = GetInputData<mifloat>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  MACE_ASSERT(input_dim_size_ == 2);
  output_ = GetOutputData<mifloat>(OUTPUT);
  return MACE_SUCCESS;
}
// Resizes the output to match the input, then dispatches to the NHWC path.
MaceStatus ArmSoftmaxInt8Op::Run() {
  MACE_RETURN_IF_ERROR(ResizeOutputShape(OUTPUT, input_dim_size_, input_dims_));
  // TODO(ZhangZhimin): Workaround: AUTO is treated as NHWC.
  if (data_format_ != NHWC && data_format_ != AUTO) {
    MACE_NOT_IMPLEMENTED;
    return MACE_UNSUPPORTED;
  }
  return RunForNHWC();
}
// Int8 softmax over the last (class) dimension via CMSIS-NN arm_softmax_s8.
MaceStatus ArmSoftmaxInt8Op::RunForNHWC() {
  int32_t class_size = input_dims_[input_dim_size_ - 1];
  const int8_t *input_data = reinterpret_cast<const int8_t *>(input_);
  int8_t *output_data = reinterpret_cast<int8_t *>(output_);
  // OnInit asserts rank == 2, so dims are [num_rows, class_size].
  int32_t num_rows = input_dims_[0];
  QuantizeInfo input_quantize_info = GetInputQuantizeInfo(INPUT);
  // Fold the input scale into a fixed-point multiplier, reserving
  // kInputDeltaIntBits integer bits for the scaled input differences and
  // capping at INT32_MAX (this appears to mirror the TFLite softmax scaling
  // preprocessing).
  int kInputDeltaIntBits = 5;
  int32_t scale_q = static_cast<int32_t>(
      base::min(static_cast<double>(input_quantize_info.scale) *
                    (1 << (31 - kInputDeltaIntBits)),
                (1ll << 31) - 1.0));
  int32_t mult;
  int32_t shift;
  QuantizeMultiplier(scale_q, &mult, &shift);
  // Lower bound for the (negative) input differences fed to the kernel.
  int32_t diff_min = -128;
  arm_softmax_s8(input_data, num_rows, class_size, mult, shift, diff_min,
                 output_data);
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CMSIS_NN_ARM_SOFTMAX_INT8_H_
#define MICRO_OPS_NHWC_CMSIS_NN_ARM_SOFTMAX_INT8_H_
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Int8 softmax over the last dimension, backed by CMSIS-NN arm_softmax_s8.
class ArmSoftmaxInt8Op : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();

 private:
  // NHWC execution path (also used, as a workaround, for AUTO).
  MaceStatus RunForNHWC();

 private:
  const mifloat *input_;      // int8 data behind a mifloat pointer
  const int32_t *input_dims_;
  uint32_t input_dim_size_;   // asserted == 2 in OnInit
  mifloat *output_;
  DataFormat data_format_;    // "data_format" arg; defaults to NHWC

  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CMSIS_NN_ARM_SOFTMAX_INT8_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/cmsis_nn/dequantize.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/op_context.h"
#include "micro/framework/operator.h"
#include "micro/model/net_def.h"
namespace micro {
namespace ops {
// Caches the int8 input buffer/shape and the float output buffer.
MaceStatus DequantizeOp::OnInit() {
  input_ = GetInputData<int8_t>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  output_ = GetOutputData<mifloat>(OUTPUT);
  return MACE_SUCCESS;
}
// Converts int8 quantized values back to float:
//   output[i] = scale * (input[i] - zero_point)
MaceStatus DequantizeOp::Run() {
  MACE_RETURN_IF_ERROR(ResizeOutputShape(OUTPUT, input_dim_size_, input_dims_));
  // The previous code also fetched op_context_->input_info(INPUT) into an
  // unused local; that dead lookup has been removed.
  QuantizeInfo input_quantize_info = GetInputQuantizeInfo(INPUT);
  const float scale = input_quantize_info.scale;
  const int32_t zero_point = input_quantize_info.zero;
  // Use the shared shape helper instead of a hand-rolled product loop,
  // matching the other micro ops.
  const int32_t element_size =
      base::GetShapeSize(input_dim_size_, input_dims_);
  for (int32_t i = 0; i < element_size; ++i) {
    output_[i] = scale * (input_[i] - zero_point);
  }
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CMSIS_NN_DEQUANTIZE_H_
#define MICRO_OPS_NHWC_CMSIS_NN_DEQUANTIZE_H_
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Converts an int8 tensor to float using the input's quantization
// parameters (scale and zero point).
class DequantizeOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();

 private:
  const int8_t *input_;        // quantized int8 input data
  const int32_t *input_dims_;  // input shape
  uint32_t input_dim_size_;    // input rank
  mifloat *output_;            // dequantized float output

  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CMSIS_NN_DEQUANTIZE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/cmsis_nn/quantize.h"
#include <cmath>
#include "micro/base/logging.h"
#include "micro/base/utils.h"
namespace micro {
namespace ops {
// Clamps a (pre-rounded) float to the int8 range [-128, 127]. The value is
// truncated to int first, then saturated at both ends.
inline int8_t SaturateInt8(float value) {
  const int truncated = static_cast<int>(value);
  if (truncated >= 127) {
    return 127;
  }
  if (truncated <= -128) {
    return -128;
  }
  return static_cast<int8_t>(truncated);
}
// Caches the float input buffer/shape and the int8 output buffer.
MaceStatus QuantizeOp::OnInit() {
  input_ = GetInputData<mifloat>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  output_ = GetOutputData<int8_t>(OUTPUT);
  return MACE_SUCCESS;
}
// Quantizes float input to int8:
//   output[i] = saturate(round(input[i] / scale) + zero_point)
// using the OUTPUT tensor's quantization parameters.
MaceStatus QuantizeOp::Run() {
  MACE_RETURN_IF_ERROR(ResizeOutputShape(OUTPUT, input_dim_size_, input_dims_));
  QuantizeInfo output_quantize_info = GetOutputQuantizeInfo(OUTPUT);
  const float recip_scale = 1.0f / output_quantize_info.scale;
  const int32_t zero_point = output_quantize_info.zero;
  // Use the shared shape helper instead of a hand-rolled product loop,
  // matching the other micro ops.
  const int32_t element_size =
      base::GetShapeSize(input_dim_size_, input_dims_);
  for (int32_t i = 0; i < element_size; ++i) {
    output_[i] = SaturateInt8(roundf(recip_scale * input_[i] + zero_point));
  }
  return MACE_SUCCESS;
}
} // namespace ops
} // namespace micro
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CMSIS_NN_QUANTIZE_H_
#define MICRO_OPS_NHWC_CMSIS_NN_QUANTIZE_H_
#include "micro/framework/operator.h"
namespace micro {
namespace ops {
// Converts a float tensor to int8 using the output's quantization
// parameters (scale and zero point), saturating to [-128, 127].
class QuantizeOp : public framework::Operator {
 public:
  MaceStatus OnInit();
  MaceStatus Run();

 private:
  const mifloat *input_;       // float input data
  const int32_t *input_dims_;  // input shape
  uint32_t input_dim_size_;    // input rank
  int8_t *output_;             // quantized int8 output

  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace micro
#endif // MICRO_OPS_NHWC_CMSIS_NN_QUANTIZE_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/nhwc/cmsis_nn/utilities.h"
#include <math.h>
// Decomposes a real multiplier into a Q31 fixed-point multiplier plus a
// power-of-two shift such that
//   double_multiplier ~= quantized_multiplier * 2^(shift - 31).
// A multiplier of exactly 0 yields (0, 0); shifts below -31 are flushed to
// (0, 0) as well.
void QuantizeMultiplier(double double_multiplier,
                        int32_t *quantized_multiplier,
                        int32_t *shift) {
  if (double_multiplier == 0.) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
  // frexp writes to an `int`. The previous code reinterpret_cast the
  // int32_t* to int*, which violates strict aliasing and is wrong on any
  // platform where int is not exactly 32 bits; use a local int instead.
  int exponent = 0;
  const double q = frexp(double_multiplier, &exponent);
  int32_t out_shift = static_cast<int32_t>(exponent);
  int64_t q_fixed = static_cast<int64_t>(round(q * (1ll << 31)));
  // Rounding can push |q| up to exactly 1.0; renormalize into Q31 range.
  if (q_fixed == (1ll << 31)) {
    q_fixed /= 2;
    ++out_shift;
  }
  // Underflow: the multiplier is too small to represent.
  if (out_shift < -31) {
    out_shift = 0;
    q_fixed = 0;
  }
  *shift = out_shift;
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MICRO_OPS_NHWC_CMSIS_NN_UTILITIES_H_
#define MICRO_OPS_NHWC_CMSIS_NN_UTILITIES_H_
#include "micro/base/types.h"
// Decomposes a real multiplier into a Q31 fixed-point multiplier plus a
// power-of-two shift, as used by CMSIS-NN requantization. A multiplier of
// 0 yields (0, 0).
void QuantizeMultiplier(double double_multiplier,
                        int32_t *quantized_multiplier,
                        int32_t *shift);
#endif // MICRO_OPS_NHWC_CMSIS_NN_UTILITIES_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "micro/ops/reshape.h"
#include "micro/base/logging.h"
#include "micro/base/utils.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
namespace {
// Resolves a requested reshape `shape_data` in place against the input
// shape:
//   * at most one -1 entry is inferred from the remaining element count,
//   * a 0 entry copies the corresponding input dimension,
//   * all other entries must be non-negative.
// Returns MACE_SUCCESS; violations are reported through the MACE_ASSERT*
// macros.
MaceStatus ValidShapeData(const int32_t *input_dims,
                          const uint32_t input_dim_size,
                          int32_t *shape_data,
                          const uint32_t shape_data_size) {
  MACE_ASSERT(
      input_dims != NULL && shape_data != NULL);
  int32_t unknown_idx = -1;  // index of the single allowed -1 entry
  int32_t product = 1;       // product of all explicitly given dims
  const int32_t input_size = base::GetShapeSize(input_dim_size, input_dims);
  for (uint32_t i = 0; i < shape_data_size; ++i) {
    if (shape_data[i] == -1) {
      MACE_ASSERT1(unknown_idx == -1, "Only one input size may be -1");
      unknown_idx = i;
      shape_data[i] = 1;  // placeholder; fixed up after the loop
    } else {
      MACE_ASSERT2(shape_data[i] >= 0, "Shape must be non-negative: ",
                   shape_data[i]);
      if (shape_data[i] == 0) {
        MACE_ASSERT1(i < input_dim_size, "dims:0 out of input dims' range.");
        shape_data[i] = input_dims[i];  // 0 means "keep this input dim"
      }
      product *= shape_data[i];
    }
  }
  // Infer the -1 dimension so the total element count matches the input.
  if (unknown_idx != -1) {
    MACE_ASSERT1(product != 0,
                 "Cannot infer shape if there is zero shape size.");
    const int32_t missing = input_size / product;
    MACE_ASSERT1(missing * product == input_size,
                 "Input size not match reshaped tensor size");
    shape_data[unknown_idx] = missing;
  }
  return MACE_SUCCESS;
}
} // namespace
// Caches the data input, the SHAPE input (the requested new shape, as an
// int32 tensor), and the output buffer.
MaceStatus ReshapeOp::OnInit() {
  input_ = GetInputData<mifloat>(INPUT);
  input_dims_ = GetInputShapeDims(INPUT);
  input_dim_size_ = GetInputShapeDimSize(INPUT);
  shape_ = GetInputData<int32_t>(SHAPE);
  shape_dims_ = GetInputShapeDims(SHAPE);
  shape_dim_size_ = GetInputShapeDimSize(SHAPE);
  output_ = GetOutputData<mifloat>(OUTPUT);
  return MACE_SUCCESS;
}
// Copies the input data to the output unchanged and resizes the output to
// the resolved target shape.
MaceStatus ReshapeOp::Run() {
  const int32_t input_data_size =
      base::GetShapeSize(input_dim_size_, input_dims_);
  const int32_t shape_data_size =
      base::GetShapeSize(shape_dim_size_, shape_dims_);
  // Copy the requested shape into scratch memory: ValidShapeData rewrites
  // -1/0 entries in place and the SHAPE input is const.
  int32_t *shape_data =
      ScratchBuffer(engine_config_).GetBuffer<int32_t>(shape_data_size);
  base::memcpy(shape_data, shape_, shape_data_size * sizeof(int32_t));
  MACE_RETURN_IF_ERROR(ValidShapeData(input_dims_, input_dim_size_,
                                      shape_data, shape_data_size));
#ifndef MACE_MICRO_NDEBUG
  // Debug-only sanity check: the resolved shape must cover exactly as many
  // elements as the input.
  const int32_t output_data_size = base::accumulate_multi(
      shape_data, 0, static_cast<uint32_t>(shape_data_size));
  if (input_data_size != output_data_size) {
    LOG(FATAL) << "input_data_size(" << input_data_size
               << ") != output_data_size(" << output_data_size
               << "), please check the model.";
  }
#endif
  // TODO(luxuhui): optimize this method by reusing buffer
  base::memcpy(output_, input_, input_data_size * sizeof(mifloat));
  return ResizeOutputShape(OUTPUT, shape_data_size, shape_data);
}
} // namespace ops
} // namespace micro
......@@ -15,17 +15,104 @@
#ifndef MICRO_OPS_RESHAPE_H_
#define MICRO_OPS_RESHAPE_H_
#include "micro/base/utils.h"
#include "micro/framework/operator.h"
#include "micro/framework/scratch_buffer.h"
namespace micro {
namespace ops {
namespace internal {
inline MaceStatus ValidShapeData(const int32_t *input_dims,
const uint32_t input_dim_size,
int32_t *shape_data,
const uint32_t shape_data_size) {
MACE_ASSERT(input_dims != NULL && shape_data != NULL);
int32_t unknown_idx = -1;
int32_t product = 1;
const int32_t input_size = base::GetShapeSize(input_dim_size, input_dims);
for (uint32_t i = 0; i < shape_data_size; ++i) {
if (shape_data[i] == -1) {
MACE_ASSERT1(unknown_idx == -1, "Only one input size may be -1");
unknown_idx = i;
shape_data[i] = 1;
} else {
MACE_ASSERT2(shape_data[i] >= 0,
"Shape must be non-negative: ", shape_data[i]);
if (shape_data[i] == 0) {
MACE_ASSERT1(i < input_dim_size, "dims:0 out of input dims' range.");
shape_data[i] = input_dims[i];
}
product *= shape_data[i];
}
}
if (unknown_idx != -1) {
MACE_ASSERT1(product != 0,
"Cannot infer shape if there is zero shape size.");
const int32_t missing = input_size / product;
MACE_ASSERT1(missing * product == input_size,
"Input size not match reshaped tensor size");
shape_data[unknown_idx] = missing;
}
return MACE_SUCCESS;
}
} // namespace internal
template <typename T>
class ReshapeOp : public framework::Operator {
public:
MaceStatus OnInit();
MaceStatus Run();
typedef T value_type;
MaceStatus OnInit() {
input_ = GetInputData<ReshapeOp::value_type>(INPUT);
input_dims_ = GetInputShapeDims(INPUT);
input_dim_size_ = GetInputShapeDimSize(INPUT);
shape_ = GetInputData<int32_t>(SHAPE);
shape_dims_ = GetInputShapeDims(SHAPE);
shape_dim_size_ = GetInputShapeDimSize(SHAPE);
output_ = GetOutputData<ReshapeOp::value_type>(OUTPUT);
return MACE_SUCCESS;
}
MaceStatus Run() {
const int32_t input_data_size =
base::GetShapeSize(input_dim_size_, input_dims_);
const int32_t shape_data_size =
base::GetShapeSize(shape_dim_size_, shape_dims_);
int32_t *shape_data =
ScratchBuffer(engine_config_).GetBuffer<int32_t>(shape_data_size);
base::memcpy(shape_data, shape_, shape_data_size * sizeof(int32_t));
MACE_RETURN_IF_ERROR(internal::ValidShapeData(input_dims_, input_dim_size_,
shape_data, shape_data_size));
#ifndef MACE_MICRO_NDEBUG
const int32_t output_data_size = base::accumulate_multi(
shape_data, 0, static_cast<uint32_t>(shape_data_size));
if (input_data_size != output_data_size) {
LOG(FATAL) << "input_data_size(" << input_data_size
<< ") != output_data_size(" << output_data_size
<< "), please check the model.";
}
#endif
// TODO(luxuhui): optimize this method by reusing buffer
base::memcpy(output_, input_,
input_data_size * sizeof(ReshapeOp::value_type));
return ResizeOutputShape(OUTPUT, shape_data_size, shape_data);
}
private:
const mifloat *input_;
const value_type *input_;
const int32_t *input_dims_;
uint32_t input_dim_size_;
......@@ -33,7 +120,7 @@ class ReshapeOp : public framework::Operator {
const int32_t *shape_dims_;
uint32_t shape_dim_size_;
mifloat *output_;
value_type *output_;
MACE_OP_INPUT_TAGS(INPUT, SHAPE);
MACE_OP_OUTPUT_TAGS(OUTPUT);
......
......@@ -36,7 +36,8 @@ MaceStatus SoftmaxOp::OnInit() {
MaceStatus SoftmaxOp::Run() {
MACE_RETURN_IF_ERROR(ResizeOutputShape(OUTPUT, input_dim_size_, input_dims_));
if (NHWC == data_format_) { // NHWC
// TODO(ZhangZhimin): Walkarounds for AUTO data format
if (NHWC == data_format_ || AUTO == data_format_) { // NHWC
return RunForNHWC();
} else {
MACE_NOT_IMPLEMENTED;
......
# Tensorflow Keras Models
MACE Micro supports Keras models from TensorFlow 2.x.
## HAR
The model is from <https://github.com/Shahnawax/HAR-CNN-Keras/>.
## MNIST
The mnist_keras.py script depends on TensorFlow 2.x and tensorflow_model_optimization. Run it to generate the "mnist.h5" and "mnist-int8.h5" models.
library_name: har
target_abis: [host]
model_graph_format: file
model_data_format: file
models:
har_int8:
platform: keras
model_file_path: https://cdn.cnbj1.fds.api.mi-img.com/mace/miai-models/micro/keras/har/har.h5
model_sha256_checksum: ec0477b8e489541bb34377c9cabc42ee6cefa8bdf0a9f726e06be1b967ea1dcd
subgraphs:
- input_tensors:
- conv2d_1_input:0
input_shapes:
- 1,90,3,1
input_ranges:
- -5,15
output_tensors:
- dense_3/Softmax:0
output_shapes:
- 1,6
runtime: cpu
limit_opencl_kernel_time: 0
nnlib_graph_mode: 0
obfuscate: 0
winograd: 0
quantize: 1
quantize_schema: int8
quantize_range_file: https://cdn.cnbj1.fds.api.mi-img.com/mace/miai-models/micro/keras/har/har.range
library_name: har
target_abis: [host]
model_graph_format: file
model_data_format: file
models:
har:
platform: keras
model_file_path: https://cdn.cnbj1.fds.api.mi-img.com/mace/miai-models/micro/keras/har/har.h5
model_sha256_checksum: ec0477b8e489541bb34377c9cabc42ee6cefa8bdf0a9f726e06be1b967ea1dcd
subgraphs:
- input_tensors:
- conv2d_1_input:0
input_shapes:
- 1,90,3,1
output_tensors:
- dense_3/Softmax:0
output_shapes:
- 1,6
runtime: cpu
data_type: fp32_fp32
limit_opencl_kernel_time: 0
nnlib_graph_mode: 0
obfuscate: 0
winograd: 0
library_name: mnist
target_abis: [host]
model_graph_format: file
model_data_format: file
models:
mnist_int8:
platform: keras
model_file_path: https://cdn.cnbj1.fds.api.mi-img.com/mace/miai-models/micro/keras/mnist/mnist_int8.h5
model_sha256_checksum: f56ae3b94c114719683c3bc55351f871d371e874d3a4d3224cc5299717e8b7fc
subgraphs:
- input_tensors:
- conv2d_input:0
input_shapes:
- 1,28,28,1
input_ranges:
- 0,1
output_tensors:
- quant_dense_1/Softmax:0
output_shapes:
- 1,10
runtime: cpu
limit_opencl_kernel_time: 0
nnlib_graph_mode: 0
obfuscate: 0
winograd: 0
quantize: 1
quantize_schema: int8
library_name: mnist
target_abis: [host]
model_graph_format: file
model_data_format: file
models:
mnist:
platform: keras
model_file_path: https://cdn.cnbj1.fds.api.mi-img.com/mace/miai-models/micro/keras/mnist/mnist.h5
model_sha256_checksum: 85f2ffe02e1b9dd2d6ad3826b91ac134fed15b838bb92a1010f67c19d55b1f65
subgraphs:
- input_tensors:
- conv2d_input:0
input_shapes:
- 1,28,28,1
output_tensors:
- dense_1/Softmax:0
output_shapes:
- 1,10
runtime: cpu
data_type: fp32_fp32
limit_opencl_kernel_time: 0
nnlib_graph_mode: 0
obfuscate: 0
winograd: 0
quantize: 0
# Refer to https://www.tensorflow.org/model_optimization/guide
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import tensorflow_model_optimization as tfmot
def normalize_img(image, label):
    """Casts an image from `uint8` to `float32` and scales it into [0, 1].

    The label is passed through unchanged, so the function can be used
    directly with `tf.data.Dataset.map` on (image, label) pairs.
    """
    image = tf.cast(image, tf.float32)
    image = image / 255.0
    return image, label
# Train a small float32 MNIST CNN, save it, then fine-tune a
# quantization-aware copy with tensorflow_model_optimization and save that
# as the int8 training artifact.
tfds.disable_progress_bar()
tf.enable_v2_behavior()

(ds_train, ds_test), ds_info = tfds.load(
    "mnist",
    split=["train", "test"],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
)

# Training pipeline: normalize -> cache -> shuffle (full train set) ->
# batch -> prefetch.
ds_train = ds_train.map(
    normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE
)
ds_train = ds_train.cache()
ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples)
ds_train = ds_train.batch(128)
ds_train = ds_train.prefetch(tf.data.experimental.AUTOTUNE)

# Evaluation pipeline: no shuffling; cached after batching.
ds_test = ds_test.map(
    normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE
)
ds_test = ds_test.batch(128)
ds_test = ds_test.cache()
ds_test = ds_test.prefetch(tf.data.experimental.AUTOTUNE)

# Conv -> DepthwiseConv -> MaxPool -> Dense head; the final layer applies
# softmax, so the model outputs probabilities, not logits.
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Conv2D(
            filters=32, kernel_size=3, activation="relu", padding="same"
        ),
        tf.keras.layers.DepthwiseConv2D(
            kernel_size=3, activation="relu", padding="same"
        ),
        tf.keras.layers.MaxPool2D(pool_size=2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
)

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=["accuracy"],
)
model.fit(
    ds_train,
    epochs=6,
    validation_data=ds_test,
)
model.save("mnist.h5")

# Quantization-aware fine-tuning of the already-trained float model.
quantize_model = tfmot.quantization.keras.quantize_model
quantization_aware_model = quantize_model(model)
quantization_aware_model.compile(
    optimizer="adam",
    # BUGFIX: the network ends in a softmax layer, so the loss must treat
    # the outputs as probabilities (from_logits=False), matching the
    # string loss used for the float model above. from_logits=True would
    # apply a second softmax inside the loss and mis-scale the gradients.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"],
)
quantization_aware_model.fit(
    ds_train,
    epochs=6,
    validation_data=ds_test,
)
# NOTE(review): the deployment config downloads "mnist_int8.h5" while this
# saves "mnist-int8.h5" — confirm the uploaded artifact name matches.
quantization_aware_model.save("mnist-int8.h5")
# Tensorflow frozen models
## KWS
The model is from <https://github.com/hyperconnect/TC-ResNet/>.
# MACE Micro deployment config: KWS TC-ResNet8 TensorFlow frozen model,
# executed with bfloat16 compute (bf16_fp32), host ABI only.
library_name: kws-tc_resnet8
target_abis: [host]
model_graph_format: file
model_data_format: file
models:
kws_tc_resnet8_bf16:
platform: tensorflow
model_file_path: https://cdn.cnbj1.fds.api.mi-img.com/mace/miai-models/micro/tensorflow/kws/kws-tc_resnet8.pb
model_sha256_checksum: c552cf79cb64d3c755ae7d867c1c78b13f55f7589d46def1f70ce657c0db0d79
subgraphs:
- input_tensors:
- input
input_shapes:
# presumably 98 frames x 40 features (audio spectrogram) — confirm
- 1,98,40,1
output_tensors:
- output/softmax
output_shapes:
# softmax over 12 keyword classes
- 1,12
runtime: cpu
data_type: bf16_fp32
limit_opencl_kernel_time: 0
nnlib_graph_mode: 0
obfuscate: 0
winograd: 0
quantize: 0
# MACE Micro deployment config: KWS TC-ResNet8 TensorFlow frozen model,
# float32 compute (fp32_fp32), host ABI only. Same .pb as the bf16 variant.
library_name: kws-tc_resnet8
target_abis: [host]
model_graph_format: file
model_data_format: file
models:
kws_tc_resnet8:
platform: tensorflow
model_file_path: https://cdn.cnbj1.fds.api.mi-img.com/mace/miai-models/micro/tensorflow/kws/kws-tc_resnet8.pb
model_sha256_checksum: c552cf79cb64d3c755ae7d867c1c78b13f55f7589d46def1f70ce657c0db0d79
subgraphs:
- input_tensors:
- input
input_shapes:
- 1,98,40,1
output_tensors:
- output/softmax
output_shapes:
- 1,12
runtime: cpu
data_type: fp32_fp32
limit_opencl_kernel_time: 0
nnlib_graph_mode: 0
obfuscate: 0
winograd: 0
quantize: 0
# Test helper library used by all test binaries.
add_subdirectory(ccutils)
# googletest is not built when targeting Hexagon DSPs, so unit tests are
# only added for non-Hexagon builds.
if(NOT HEXAGON)
include(${PROJECT_SOURCE_DIR}/third_party/googletest/googletest.cmake)
add_subdirectory(ccunit)
endif()
......
add_executable(micro_ops_test
micro/ops/stack_test.cc
micro/ops/reshape_test.cc
......@@ -20,6 +18,7 @@ add_executable(micro_ops_test
micro/ops/softmax_test.cc
micro/ops/bias_add_test.cc
micro/ops/expand_dims_test.cc
micro/ops/concat_test.cc
)
target_link_libraries(micro_ops_test
PRIVATE micro_base
......@@ -36,7 +35,8 @@ if(MICRO_MODEL_NAME)
micro/codegen/engine_test.cc
)
target_link_libraries(micro_cc_test
micro_engine
micro
models
gtest
gtest_main
)
......
......@@ -33,8 +33,9 @@ class EngineTest : public ::testing::Test {
void OutputAllInfo() {
MaceMicroEngine *micro_engine = NULL;
MACE_ASSERT(MICRO_MODEL_NAME::GetMicroEngineSingleton(&micro_engine)
== MACE_SUCCESS && micro_engine != NULL);
MACE_ASSERT(MICRO_MODEL_NAME::GetMicroEngineSingleton(&micro_engine) ==
MACE_SUCCESS &&
micro_engine != NULL);
float input_buffer[1 * 1 * 128 * 9] = {0};
int32_t input_shape[] = {1, 1, 128, 9};
......
......@@ -33,7 +33,7 @@ void TestReshapeOp(
T *y, int32_t *y_dims, const uint32_t y_dim_size,
const T *e, const int32_t *e_dims, const uint32_t e_dim_size) {
ReshapeOp reshape_op;
ReshapeOp<T> reshape_op;
framework::SubstituteOp substitude_op;
substitude_op.AddInput(input, input_dims, input_dim_size)
.AddInput(shape, shape_dims, 1)
......
Subproject commit 378acfb6490a82ba90e1ffb4bfd4e602668b180a
Subproject commit a386bd0f204cf99db253b3e84c56795dea8c397f
Copyright (c) 2006, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Builds gflags 2.2.2 via ExternalProject and exposes it as an imported
# static library target `gflags`.
INCLUDE(ExternalProject)
set(GFLAGS_SRCS_DIR "${MACE_THIRD_PARTY_DIR}/gflags")
set(GFLAGS_INSTALL_DIR "${MACE_THIRD_PARTY_DIR}/install/gflags")
set(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
# Static library name differs between MSVC and Unix-style toolchains.
if(MSVC)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
else(MSVC)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
endif(MSVC)
include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR})
# Mirror of https://github.com/gflags/gflags/archive/v2.2.2.zip
set(GFLAGS_URL "https://cnbj1.fds.api.xiaomi.com/mace/third-party/gflags/v2.2.2.zip")
set(GFLAGS_HASH "SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5")
ExternalProject_Add(
gflags_gflags
URL_HASH "${GFLAGS_HASH}"
URL "${GFLAGS_URL}"
PREFIX ${GFLAGS_SRCS_DIR}
UPDATE_COMMAND ""
BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}
-DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}
-DBUILD_STATIC_LIBS=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_GENERATOR=${CMAKE_GENERATOR}
${THIRD_PARTY_EXTRA_CMAKE_ARGS}
)
# MSVC Debug builds produce gflags_static_debug.lib; copy it to the name
# recorded in GFLAGS_LIBRARIES so the imported target resolves.
if(MSVC)
add_custom_command(TARGET gflags_gflags POST_BUILD
COMMAND if $<CONFIG:Debug>==1 (${CMAKE_COMMAND} -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static_debug.lib ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib)
)
endif(MSVC)
add_library(gflags STATIC IMPORTED GLOBAL)
set_property(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
add_dependencies(gflags gflags_gflags)
# gflags on Windows depends on Shlwapi for path utilities.
if(MSVC)
set_target_properties(gflags
PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES
Shlwapi.lib)
endif(MSVC)
Subproject commit e6e2d3b7614ff4e6017d8968bd4c3f579133666e
Copyright 2008, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Bazel build file for the bundled googletest + googlemock sources.
licenses(["notice"])
exports_files(["LICENSE"])
# Combined gtest/gmock library built from the amalgamated *-all.cc sources.
cc_library(
name = "gtest",
srcs = [
"googletest/src/gtest-all.cc",
"googlemock/src/gmock-all.cc",
],
# The *-all.cc sources #include sibling .cc files, so those must be
# visible as headers rather than compiled separately.
hdrs = glob([
"**/*.h",
"googletest/src/*.cc",
"googlemock/src/*.cc",
]),
includes = [
"googlemock",
"googletest",
"googletest/include",
"googlemock/include",
],
linkopts = ["-pthread"],
visibility = ["//visibility:public"],
)
# Default main() entry point (gmock_main also initializes gtest).
cc_library(
name = "gtest_main",
srcs = ["googlemock/src/gmock_main.cc"],
linkopts = ["-pthread"],
visibility = ["//visibility:public"],
deps = [":gtest"],
)
# Builds googletest 1.8.0 (with gmock) via ExternalProject and exposes the
# results as imported static libraries `gtest` and `gtest_main`.
enable_testing()
include(ExternalProject)
set(GTEST_SOURCES_DIR ${MACE_THIRD_PARTY_DIR}/gtest)
set(GTEST_INSTALL_DIR ${MACE_THIRD_PARTY_DIR}/install/gtest)
set(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
include_directories(SYSTEM ${GTEST_INCLUDE_DIR})
# Static library names differ between MSVC and Unix-style toolchains.
if(MSVC)
set(GTEST_LIBRARIES
"${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
set(GTEST_MAIN_LIBRARIES
"${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
else(MSVC)
set(GTEST_LIBRARIES
"${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
set(GTEST_MAIN_LIBRARIES
"${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
endif(MSVC)
# Mirror of "https://github.com/google/googletest/archive/release-1.8.0.zip"
set(GTEST_URL "https://cnbj1.fds.api.xiaomi.com/mace/third-party/googletest/googletest-release-1.8.0.zip")
set(GTEST_HASH "SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf")
ExternalProject_Add(
extern_gtest
URL_HASH "${GTEST_HASH}"
URL "${GTEST_URL}"
PREFIX ${GTEST_SOURCES_DIR}
UPDATE_COMMAND ""
BUILD_BYPRODUCTS ${GTEST_LIBRARIES} ${GTEST_MAIN_LIBRARIES}
# FIX: -DCMAKE_BUILD_TYPE was passed twice with the same value; keep a
# single occurrence.
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}
-DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}
-DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-DBUILD_GMOCK=ON
-Dgtest_disable_pthreads=ON
-Dgtest_force_shared_crt=ON
-DCMAKE_GENERATOR=${CMAKE_GENERATOR}
${THIRD_PARTY_EXTRA_CMAKE_ARGS}
)
add_library(gtest STATIC IMPORTED GLOBAL)
set_property(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES})
add_dependencies(gtest extern_gtest)
add_library(gtest_main STATIC IMPORTED GLOBAL)
set_property(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
add_dependencies(gtest_main extern_gtest)
# Root directory under which all third-party packages are downloaded and built.
set(MACE_THIRD_PARTY_DIR "${PROJECT_BINARY_DIR}/third_party" CACHE STRING "Third party libraries download & build directories.")
# Forwarding the cross compile flags
set(THIRD_PARTY_EXTRA_CMAKE_ARGS
-DCMAKE_C_FLAGS=${MACE_CC_FLAGS}
-DCMAKE_CXX_FLAGS=${MACE_CC_FLAGS}
)
# Forward each cross-compilation knob to the ExternalProject sub-builds,
# but only when it is set in this build.
foreach(_fwd_var
CMAKE_TOOLCHAIN_FILE
CROSSTOOL_ROOT
ANDROID_ABI
ANDROID_NATIVE_API_LEVEL
PLATFORM)
if(${_fwd_var})
list(APPEND THIRD_PARTY_EXTRA_CMAKE_ARGS -D${_fwd_var}=${${_fwd_var}})
endif()
endforeach()
if(MICRO_MODEL_NAME)
include (${PROJECT_SOURCE_DIR}/third_party/gflags/gflags.cmake)
add_executable(micro_run_static micro_run.cc)
target_link_libraries(micro_run_static micro_engine gflags)
target_link_libraries(micro_run_static micro models gflags)
target_compile_options(micro_run_static PRIVATE "-std=c++11")
target_compile_definitions(micro_run_static PRIVATE "-DMICRO_MODEL_NAME=${MICRO_MODEL_NAME}")
if(NOT ANDROID)
target_link_libraries(micro_run_static pthread)
......
#! /bin/bash
# Builds the MACE Micro development Docker image from its dockerfile and
# tags it as mace-micro-dev.
cd docker/mace-micro-dev
docker build . -f mace-micro-dev.dockerfile --tag mace-micro-dev
cd ../..
\ No newline at end of file
#! /bin/bash
# Converts the KWS TC-ResNet8 model, cross-compiles MACE Micro for
# Cortex-M7 (CMSIS-NN on, soft-float ABI), then builds the mbed classifier
# example for the NUCLEO_F767ZI board.
python tools/python/convert.py --config micro/pretrained_models/tensorflow/kws/kws-tc_resnet8.yml --enable_micro || exit -1
./micro/tools/cmake/cmake-build-gcc-arm-none-eabi.sh \
-DARM_CPU=cortex-m7 \
-DMACE_MICRO_ENABLE_CMSIS=ON \
-DMACE_MICRO_ENABLE_HARDFP=OFF || exit -1
# Copy the installed headers/libraries into the mbed example project.
cp build/micro/gcc-arm-none-eabi/install micro/examples/classifier -r
cd micro/examples/classifier
mbed deploy || exit -1
mbed compile -t GCC_ARM -m NUCLEO_F767ZI -D MICRO_MODEL_NAME=kws_tc_resnet8 -D MICRO_DATA_NAME=kws || exit -1
cd ../../..
\ No newline at end of file
#! /bin/bash
# CI build matrix for MACE Micro: host float32, host bfloat16, and a
# gcc-arm-none-eabi (Cortex-M7) cross build, all with CMSIS enabled.
# Each configuration starts from a clean build/micro tree.
git submodule update --init .
echo "Builds host float32"
rm -rf build/micro
./micro/tools/cmake/cmake-build-host.sh \
-DMACE_MICRO_ENABLE_TESTS=ON \
-DMACE_MICRO_ENABLE_CMSIS=ON || exit -1
echo "Builds host bfloat16"
rm -rf build/micro
./micro/tools/cmake/cmake-build-host.sh \
-DMACE_MICRO_ENABLE_BFLOAT16=ON \
-DMACE_MICRO_ENABLE_TESTS=ON \
-DMACE_MICRO_ENABLE_CMSIS=ON || exit -1
echo "Builds gcc arm cortex-m7"
rm -rf build/micro
./micro/tools/cmake/cmake-build-gcc-arm-none-eabi.sh \
-DARM_CPU=cortex-m7 \
-DMACE_MICRO_ENABLE_TESTS=OFF \
-DMACE_MICRO_ENABLE_CMSIS=ON || exit -1
# NOTE(review): the build scripts run in subshells and do not change this
# script's cwd, so this trailing `cd ..` looks like a leftover — confirm.
cd ..
\ No newline at end of file
#! /bin/bash
# Builds and runs two host example classifiers:
#   1) KWS TC-ResNet8 float32 (CMSIS off)
#   2) MNIST int8 (CMSIS-NN kernels on)
python tools/python/convert.py --config micro/pretrained_models/tensorflow/kws/kws-tc_resnet8.yml --enable_micro || exit -1
rm -rf build/micro
./micro/tools/cmake/cmake-build-host.sh \
-DMACE_MICRO_ENABLE_EXAMPLES=ON -DMICRO_MODEL_NAME=kws_tc_resnet8 -DMICRO_DATA_NAME=kws \
-DMACE_MICRO_ENABLE_TESTS=OFF \
-DMACE_MICRO_ENABLE_CMSIS=OFF || exit -1
./build/micro/host/examples/classifier/kws_tc_resnet8
# NOTE(review): the int8 conversion uses python3 while the one above uses
# python — confirm the interpreter mix is intentional for this CI image.
python3 tools/python/convert.py --config micro/pretrained_models/keras/mnist/mnist-int8.yml --enable_micro || exit -1
rm -rf build/micro
./micro/tools/cmake/cmake-build-host.sh \
-DMACE_MICRO_ENABLE_CMSIS=ON \
-DMACE_MICRO_ENABLE_EXAMPLES=ON \
-DMICRO_MODEL_NAME=mnist_int8 -DMICRO_DATA_NAME=mnist \
-DMACE_MICRO_ENABLE_TESTS=OFF || exit -1
./build/micro/host/examples/classifier/mnist_int8
# NOTE(review): cwd is unchanged by the subshell scripts above, so this
# trailing `cd ..` appears to be a leftover — confirm.
cd ..
\ No newline at end of file
#! /bin/bash
# Builds MACE Micro for the host with tests and CMSIS enabled, then runs
# the unit tests and the benchmark binary; fails fast on any error.
git submodule update --init .
rm -rf build/micro
./micro/tools/cmake/cmake-build-host.sh \
-DMACE_MICRO_ENABLE_TESTS=ON \
-DMACE_MICRO_ENABLE_CMSIS=ON || exit -1
echo "MACE Micro ut"
build/micro/host/test/ccunit/micro_ops_test || exit -1
echo "MACE Micro benchmark"
build/micro/host/test/ccbenchmark/micro_cc_benchmark || exit -1
# NOTE(review): nothing above changes this script's cwd, so this trailing
# `cd ..` looks like a leftover — confirm.
cd ..
\ No newline at end of file
#! /bin/bash
# End-to-end model CI: clones the private mace-models repo, then converts,
# builds, and validates every MACE Micro model config (HAR float32/bf16,
# MNIST float32/int8, HAR Keras float32/int8, KWS float32/bf16).
rm -rf mace-models
rm -rf build/micro
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@git.n.xiaomi.com:applied-machine-learning/sysml/mace-models.git
git submodule update --init . || exit -1
CONF_FILE=mace-models/micro-models/har-cnn/har-cnn.yml
python tools/python/convert.py --config=${CONF_FILE} --enable_micro || exit -1
python tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name har_cnn || exit -1
# har_cnn is additionally benchmarked, not just validated.
python tools/python/run_micro.py --config $CONF_FILE --model_name har_cnn --build --benchmark || exit -1
CONF_FILE=mace-models/micro-models/har-cnn/har-cnn-bf16.yml
python tools/python/convert.py --config=${CONF_FILE} --enable_micro || exit -1
python tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name har_cnn || exit -1
# NOTE(review): the Keras configs use python3 while the rest use python —
# confirm the interpreter mix is intentional for this CI image.
CONF_FILE=mace-models/micro-models/keras/mnist/mnist.yml
python3 tools/python/convert.py --config=${CONF_FILE} --enable_micro || exit -1
python3 tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name mnist || exit -1
CONF_FILE=mace-models/micro-models/keras/mnist/mnist-int8.yml
python3 tools/python/convert.py --config=${CONF_FILE} --enable_micro || exit -1
python3 tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name mnist_int8 || exit -1
CONF_FILE=mace-models/micro-models/keras/har/har.yml
python3 tools/python/convert.py --config=${CONF_FILE} --enable_micro || exit -1
python3 tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name har || exit -1
CONF_FILE=mace-models/micro-models/keras/har/har-int8.yml
python3 tools/python/convert.py --config=${CONF_FILE} --enable_micro || exit -1
python3 tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name har_int8 || exit -1
CONF_FILE=mace-models/micro-models/tensorflow/kws/kws-tc_resnet8.yml
python tools/python/convert.py --config=${CONF_FILE} --enable_micro || exit -1
python tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name kws_tc_resnet8 || exit -1
CONF_FILE=mace-models/micro-models/tensorflow/kws/kws-tc_resnet8-bf16.yml
python tools/python/convert.py --config=${CONF_FILE} --enable_micro || exit -1
python tools/python/run_micro.py --config $CONF_FILE --build --validate --model_name kws_tc_resnet8_bf16 || exit -1
# Clean up the cloned models repo.
rm -rf mace-models
#!/bin/bash
if [ -z "$ANDROID_NDK_HOME" ]; then
echo "ANDROID_NDK_HOME is undefined";
exit -1;
fi
if [ -z "$HEXAGON_SDK_ROOT" ]; then
echo "HEXAGON_SDK_ROOT is undefined";
exit -1;
fi
BUILD_DIR=build/cmake-build/arm64-v8a
BUILD_DIR=build/micro/arm64-v8a
mkdir -p ${BUILD_DIR}
cd ${BUILD_DIR}
cmake ../../.. \
cmake ../../../micro \
-DANDROID_ABI="arm64-v8a" \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake \
-DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT} \
......@@ -20,9 +23,10 @@ cmake ../../.. \
-DANDROID_STL=c++_shared \
-DMACE_ENABLE_RPCMEM=ON \
-DCMAKE_INSTALL_PREFIX=install \
-DMACE_MICRO_ENABLE_EXAMPLES=OFF \
-DHEXAGON_STUB=ON \
$@ || exit 1
cmake --build . -- -j || exit 1
cmake --build . --target install --target install -- -j || exit 1
cd ../../..
#!/bin/bash
if [ -z "$GCC_ARM_ROOT" ]; then
echo "GCC_ARM_ROOT is undefined";
fi
BUILD_DIR=build/cmake-build/gcc-arm-none-eabi
BUILD_DIR=build/micro/gcc-arm-none-eabi
mkdir -p ${BUILD_DIR}
cd ${BUILD_DIR}
cmake ../../.. \
-DGCC_ARM_ROOT=${GCC_ARM_ROOT} \
cmake ../../../micro \
-DCMAKE_TOOLCHAIN_FILE=./cmake/toolchain/gcc-arm-none-eabi.cmake \
-DMACE_MICRO_ENABLE_CMSIS=ON \
-DCMAKE_INSTALL_PREFIX=install \
-DMACE_MICRO_ENABLE_TESTS=OFF \
$@ || exit 1
cmake --build . -- -j || exit 1
cmake --build . --target install -- -j || exit 1
cd ../../..
......@@ -10,17 +10,18 @@ if [ -z "$HEXAGON_SDK_ROOT" ]; then
echo "HEXAGON_SDK_ROOT is undefined";
fi
BUILD_DIR=build/cmake-build/hexagon6
BUILD_DIR=build/micro/hexagon6
mkdir -p ${BUILD_DIR}
cd ${BUILD_DIR}
cmake ../../.. \
cmake ../../../micro \
-DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT} \
-DHEXAGON_TOOLS=${HEXAGON_TOOLS} \
-DMACE_MICRO_ENABLE_EXAMPLES=OFF \
-DCMAKE_TOOLCHAIN_FILE=./cmake/toolchain/hexagon6.toolchain.cmake \
-DCMAKE_INSTALL_PREFIX=install \
$@ || exit 1
cmake --build . -- -j || exit 1
cmake --build . --target install -- -j || exit 1
cd ../../..
......@@ -10,16 +10,18 @@ if [ -z "$HEXAGON_SDK_ROOT" ]; then
echo "HEXAGON_SDK_ROOT is undefined";
fi
BUILD_DIR=build/cmake-build/hexagon8
mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
BUILD_DIR=build/micro/hexagon8
mkdir -p ${BUILD_DIR}
cd ${BUILD_DIR}
cmake ../../.. \
cmake ../../../micro \
-DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT} \
-DHEXAGON_TOOLS=${HEXAGON_TOOLS} \
-DMACE_MICRO_ENABLE_EXAMPLES=OFF \
-DCMAKE_TOOLCHAIN_FILE=./cmake/toolchain/hexagon8.toolchain.cmake \
-DCMAKE_INSTALL_PREFIX=install \
$@ || exit 1
cmake --build . -- -j || exit 1
cmake --build . --target install -- -j || exit 1
cd ../../..
#!/bin/bash
BUILD_DIR=build/cmake-build/host
BUILD_DIR=build/micro/host
mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
cmake ../../.. \
-DMACE_MICRO_ENABLE_TESTS=ON \
cmake ../../../micro \
-DCMAKE_INSTALL_PREFIX=install \
$@ || exit 1
cmake --build . -- -j || exit 1
cmake --build . --target install -- -j || exit 1
cd ../../..
#!/bin/bash
# Generates the Python protobuf modules by building the CMake targets
# mace_proto_py and micro_mem_proto_py.
if [[ -z "$BUILD_DIR" ]]; then
# NOTE(review): other build scripts in this commit moved to build/micro/*;
# this default still points at build/cmake-build/host — confirm intended.
BUILD_DIR=build/cmake-build/host
fi
mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
cmake ../../..
make mace_proto_py micro_mem_proto_py -j
cd ../../..
......@@ -61,6 +61,7 @@ PlatformTypeStrs = [
"caffe",
"onnx",
"megengine",
"keras"
]
PlatformType = Enum('PlatformType', [(ele, ele) for ele in PlatformTypeStrs],
type=str)
......
......@@ -8,4 +8,12 @@ cpplint --linelength=80 --counting=detailed --root=test/ccutils $(find test/ccut
cpplint --linelength=80 --counting=detailed --root=test/ccunit $(find test/ccunit -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed --root=test/ccbenchmark $(find test/ccbenchmark -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed $(find ./micro -path ./micro/codegen -prune -o -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed --filter=-build/include_what_you_use $(find micro/base -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed $(find micro/framework -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed $(find micro/include -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed $(find micro/model -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed --filter=-build/include_what_you_use $(find micro/ops -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed $(find micro/port -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed $(find micro/test \( -path micro/test/ccbenchmark/codegen -or -path micro/test/ccbaseline/codegen \) -prune -o -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed $(find micro/tools -name "*.h" -or -name "*.cc")
cpplint --linelength=80 --counting=detailed --filter=-build/include_subdir $(find micro/examples \( -path micro/examples/classifier/mbed-os -or -path micro/examples/classifier/data -or -path micro/examples/classifier/install -or -path micro/examples/classifier/BUILD \) -prune -name "*.cc" -or -name "*.h")
......@@ -123,6 +123,8 @@ def convert_model(conf, quantize_stat):
option.change_concat_ranges = conf[ModelKeys.change_concat_ranges]
if ModelKeys.cl_mem_type in conf:
option.cl_mem_type = conf[ModelKeys.cl_mem_type]
if ModelKeys.platform in conf:
option.platform = conf[ModelKeys.platform]
if ModelKeys.runtime in conf:
option.device = conf[ModelKeys.runtime]
if option.device == DeviceType.CPU_GPU:
......@@ -190,6 +192,10 @@ def convert_model(conf, quantize_stat):
from transform import megengine_converter
converter = megengine_converter.MegengineConverter(
option, conf["model_file_path"])
elif platform == Platform.KERAS:
from transform import keras_converter
converter = keras_converter.KerasConverter(
option, conf["model_file_path"])
else:
mace_check(False, "Mace do not support platorm %s yet." % platform)
......
......@@ -20,7 +20,7 @@ namespace micro {
namespace {{model_tag}} {
uint8_t kGraphData[{{ data_size }}] = {
{% for d in embed_data %}{{"0x%02X, " % d }}{%endfor%}
{{ hex_bytes_string }}
};
} // namespace {{model_tag}}
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册