diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5e3a22c5bd6ac679c3d38398dbe190d32cd81f59..f454edf8aa0f3c850eaf477d55d499abea69cc25 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -9,6 +9,7 @@ stages:
   - api_test
   - python_tools_tests
   - model_tests
+  - quantization_tests
   - build_android_demo
   - ops_benchmark
   - extra_tests
@@ -62,6 +63,14 @@ api_test:
     - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
     - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
     - python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
+    - >
+      if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
+        GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
+        DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
+      fi
+    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
+    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
+    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
 
 ops_benchmark:
   stage: ops_benchmark
@@ -103,7 +112,7 @@ ndk_versions_compatible_tests:
         DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
       fi
     - >
-      for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b;
+      for ndk in android-ndk-r15c android-ndk-r16 android-ndk-r17b;
       do
       new_ndk_path=${prefix_path}${ndk};
       if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then
@@ -111,8 +120,12 @@ ndk_versions_compatible_tests:
         export PATH=$ANDROID_NDK_HOME:$PATH;
         echo "ndk path: $ANDROID_NDK_HOME";
         if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
-        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
-        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
       fi
       done
     - export ANDROID_NDK_HOME=$DEFAULT_NDK_PATH
@@ -131,9 +144,9 @@ python_tools_tests:
         DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
       fi
     - >
-      python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a,arm64 --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,armhf --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,armhf --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
 
 model_tests:
   stage: model_tests
@@ -142,23 +155,39 @@ model_tests:
     - rm -rf mace-models
     - rm -rf generic-mobile-devices
     - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
+    - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml
     - >
       if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
         GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
         DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
       fi
     - >
-      for CONF_FILE in mace-models/mobilenet-v1/mobilenet-v1.yml mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml;
-      do
-      python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
-      done
+      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
     - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
     - >
       python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+    - rm -rf mace-models
+
+quantization_tests:  # converts, runs and validates the quantize-retrain MobileNet-v1 config
+  stage: quantization_tests
+  script:
+    - pwd
+    - rm -rf mace-models generic-mobile-devices  # git clone refuses an existing non-empty directory
+    - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
+    - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml
+    - >  # the device list is cloned only when the internal git host answers a ping
+      if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
+        GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
+        DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
+      fi
+    - >  # every step aborts the job on failure via "|| exit 1"
+      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
     - rm -rf mace-models
 
 build_android_demo:
diff --git a/docs/index.rst b/docs/index.rst
index f839a13f7cf8d04c39d63280306ee3fb8dff513b..7545f2aa8c3227a88fc1b1e4fdc1ea194186c474 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -27,6 +27,7 @@ The main documentation is organized into the following sections:
 
    user_guide/basic_usage
    user_guide/advanced_usage
+   user_guide/benchmark
    user_guide/op_lists
    user_guide/quantization_usage
 
diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst
index 93ebb4f8d1c9f66d9aa600c1b063f2a6b8d488da..8395c45b783588f047e51a9a0bedcae0a5a7bd11 100644
--- a/docs/user_guide/advanced_usage.rst
+++ b/docs/user_guide/advanced_usage.rst
@@ -379,6 +379,8 @@ Useful Commands
 
 * **benchmark and profile model**
 
+The detailed information is in :doc:`benchmark`.
+
 .. code:: sh
 
     # Benchmark model, get detailed statistics of each Op.
diff --git a/docs/user_guide/basic_usage.rst b/docs/user_guide/basic_usage.rst
index d4d404baf8d652f169fe029be2a4966880351dd6..6d59a68eced45173ecc8c5e448f20661d34e6ecf 100644
--- a/docs/user_guide/basic_usage.rst
+++ b/docs/user_guide/basic_usage.rst
@@ -227,7 +227,7 @@ to run and validate your model.
 
 * **benchmark**
 
-    benchmark and profile the model.
+    benchmark and profile the model. The details are in :doc:`benchmark`.
 
     .. code:: sh
 
diff --git a/docs/user_guide/benchmark.rst b/docs/user_guide/benchmark.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a190a7cc4443a8c8e9147bcd91dd8f765c43268d
--- /dev/null
+++ b/docs/user_guide/benchmark.rst
@@ -0,0 +1,293 @@
+Benchmark usage
+===============
+
+This part contains the usage of MACE benchmark tools.
+
+Overview
+--------
+
+As mentioned in the previous part, there are two kinds of benchmark tools,
+one for operators and the other for models.
+
+Operator Benchmark
+------------------
+
+Operator Benchmark is used to test and optimize the performance of a specific operator.
+
+=====
+Usage
+=====
+
+    .. code:: bash
+
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_benchmark" --run_target=True  --args="--filter=.*BM_CONV.*"
+
+======
+Output
+======
+
+    .. code:: bash
+
+        Benchmark                                                    Time(ns) Iterations Input(MB/s)   GMACPS
+        ------------------------------------------------------------------------------------------------------
+        MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_CPU       1759129        479     114.09      29.21
+        MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_GPU       4031301        226      49.79      12.75
+        MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_half_GPU        3996357        266      25.11      12.86
+        MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_uint8_t_CPU      914994       1093      54.84      56.15
+
+
+===========
+Explanation
+===========
+
+.. list-table::
+    :header-rows: 1
+
+    * - Options
+      - Usage
+    * - Benchmark
+      - Benchmark unit name.
+    * - Time
+      - Time of one round.
+    * - Iterations
+      - The number of iterations to run, which is between 10 and 1,000,000,000. The value is chosen so that the total run time does not exceed 1s.
+    * - Input
+      - The bandwidth of processing the input. The unit is MB/s.
+    * - GMACPS
+      - The speed of running MACs (multiply-accumulate operations). The unit is G/s.
+
+Model Benchmark
+---------------
+
+Model Benchmark is used to test and optimize the performance of your model.
+This tool could record the running time of the model and the detailed running information of each operator of your model.
+
+=====
+Usage
+=====
+
+    .. code:: bash
+
+        python tools/converter.py benchmark --config=/path/to/your/model_deployment.yml
+
+======
+Output
+======
+
+    .. code:: bash
+
+        I benchmark_model.cc:158 ---------------------------------------------------------------------
+        I benchmark_model.cc:158                                Warm Up
+        I benchmark_model.cc:158 ----------------------------------------------------------------------
+        I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |   std |
+        I benchmark_model.cc:158 ----------------------------------------------------------------------
+        I benchmark_model.cc:158 |     1 |    51.481 |   51.481 |  51.481 |  51.481 |  51.481 | 0.000 |
+        I benchmark_model.cc:158 ----------------------------------------------------------------------
+        I benchmark_model.cc:158
+        I benchmark_model.cc:158 ------------------------------------------------------------------------
+        I benchmark_model.cc:158                          Run without statistics
+        I benchmark_model.cc:158 -------------------------------------------------------------------------
+        I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |      std |
+        I benchmark_model.cc:158 -------------------------------------------------------------------------
+        I benchmark_model.cc:158 |   100 |    30.272 |   31.390 |  29.938 |  45.966 |  30.913 | 1850.983 |
+        I benchmark_model.cc:158 -------------------------------------------------------------------------
+        I benchmark_model.cc:158
+        I benchmark_model.cc:158 -----------------------------------------------------------------------
+        I benchmark_model.cc:158                           Run with statistics
+        I benchmark_model.cc:158 ------------------------------------------------------------------------
+        I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |     std |
+        I benchmark_model.cc:158 ------------------------------------------------------------------------
+        I benchmark_model.cc:158 |   100 |    32.358 |   33.327 |  32.293 |  33.607 |  33.002 | 310.435 |
+        I benchmark_model.cc:158 ------------------------------------------------------------------------
+        I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343                                                                                      Sort by Run Order
+        I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343 |         Op Type |  Start | First | Avg(ms) |     % |    cdf% | GMACPS | Stride |   Pad |    Filter Shape |   Output Shape | Dilation |                                               name |
+        I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343 |       Transpose |  0.000 | 0.102 |   0.100 | 0.315 |   0.315 |  0.000 |        |       |                 |  [1,3,224,224] |          |                                              input |
+        I statistics.cc:343 |          Conv2D |  0.107 | 1.541 |   1.570 | 4.943 |   5.258 |  6.904 |  [2,2] |  SAME |      [32,3,3,3] | [1,32,112,112] |    [1,1] |             MobilenetV1/MobilenetV1/Conv2d_0/Relu6 |
+        I statistics.cc:343 | DepthwiseConv2d |  1.724 | 0.936 |   0.944 | 2.972 |   8.230 |  3.827 |  [1,1] |  SAME |      [1,32,3,3] | [1,32,112,112] |    [1,1] |   MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6 |
+        I statistics.cc:343 |         Softmax | 32.835 | 0.039 |   0.042 | 0.131 |  99.996 |  0.000 |        |       |                 |       [1,1001] |          |                    MobilenetV1/Predictions/Softmax |
+        I statistics.cc:343 |        Identity | 32.880 | 0.001 |   0.001 | 0.004 | 100.000 |  0.000 |        |       |                 |       [1,1001] |          | mace_output_node_MobilenetV1/Predictions/Reshape_1 |
+        I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343                                                                              Sort by Computation Time
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343 | Op Type |  Start | First | Avg(ms) |     % |   cdf% | GMACPS | Stride |  Pad |    Filter Shape |   Output Shape | Dilation |                                              name |
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343 |  Conv2D | 30.093 | 2.102 |   2.198 | 6.922 |  6.922 | 23.372 |  [1,1] | SAME | [1024,1024,1,1] |   [1,1024,7,7] |    [1,1] | MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D |  7.823 | 2.115 |   2.164 | 6.813 | 13.735 | 23.747 |  [1,1] | SAME |   [128,128,1,1] |  [1,128,56,56] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 15.859 | 2.119 |   2.109 | 6.642 | 20.377 | 24.358 |  [1,1] | SAME |   [512,512,1,1] |  [1,512,14,14] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 23.619 | 2.087 |   2.096 | 6.599 | 26.976 | 24.517 |  [1,1] | SAME |   [512,512,1,1] |  [1,512,14,14] |    [1,1] | MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 26.204 | 2.081 |   2.093 | 6.590 | 33.567 | 24.549 |  [1,1] | SAME |   [512,512,1,1] |  [1,512,14,14] |    [1,1] | MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 21.038 | 2.036 |   2.091 | 6.585 | 40.152 | 24.569 |  [1,1] | SAME |   [512,512,1,1] |  [1,512,14,14] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 18.465 | 2.034 |   2.082 | 6.554 | 46.706 | 24.684 |  [1,1] | SAME |   [512,512,1,1] |  [1,512,14,14] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D |  2.709 | 1.984 |   2.058 | 6.482 | 53.188 | 12.480 |  [1,1] | SAME |     [64,32,1,1] | [1,64,112,112] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 12.220 | 1.788 |   1.901 | 5.986 | 59.174 | 27.027 |  [1,1] | SAME |   [256,256,1,1] |  [1,256,28,28] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D |  0.107 | 1.541 |   1.570 | 4.943 | 64.117 |  6.904 |  [2,2] | SAME |      [32,3,3,3] | [1,32,112,112] |    [1,1] |            MobilenetV1/MobilenetV1/Conv2d_0/Relu6 |
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------
+        I statistics.cc:343                                        Stat by Op Type
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------
+        I statistics.cc:343 |         Op Type | Count | Avg(ms) |      % |    cdf% |        MACs | GMACPS | Called times |
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------
+        I statistics.cc:343 |          Conv2D |    15 |  24.978 | 78.693 |  78.693 | 551,355,392 | 22.074 |           15 |
+        I statistics.cc:343 | DepthwiseConv2d |    13 |   6.543 | 20.614 |  99.307 |  17,385,984 |  2.657 |           13 |
+        I statistics.cc:343 |       Transpose |     1 |   0.100 |  0.315 |  99.622 |           0 |  0.000 |            1 |
+        I statistics.cc:343 |         Pooling |     1 |   0.072 |  0.227 |  99.849 |           0 |  0.000 |            1 |
+        I statistics.cc:343 |         Softmax |     1 |   0.041 |  0.129 |  99.978 |           0 |  0.000 |            1 |
+        I statistics.cc:343 |         Squeeze |     1 |   0.006 |  0.019 |  99.997 |           0 |  0.000 |            1 |
+        I statistics.cc:343 |        Identity |     1 |   0.001 |  0.003 | 100.000 |           0 |  0.000 |            1 |
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------
+        I statistics.cc:343
+        I statistics.cc:343 ---------------------------------------------------------
+        I statistics.cc:343           Stat by MACs(Multiply-Accumulation)
+        I statistics.cc:343 ---------------------------------------------------------
+        I statistics.cc:343 |       total | round | first(G/s) | avg(G/s) |     std |
+        I statistics.cc:343 ---------------------------------------------------------
+        I statistics.cc:343 | 568,741,376 |   100 |     18.330 |   17.909 | 301.326 |
+        I statistics.cc:343 ---------------------------------------------------------
+        I statistics.cc:343 ------------------------------------------------------------------------
+        I statistics.cc:343                           Summary of Ops' Stat
+        I statistics.cc:343 ------------------------------------------------------------------------
+        I statistics.cc:343 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |     std |
+        I statistics.cc:343 ------------------------------------------------------------------------
+        I statistics.cc:343 |   100 |    31.028 |   32.093 |  31.028 |  32.346 |  31.758 | 301.326 |
+        I statistics.cc:343 ------------------------------------------------------------------------
+
+
+===========
+Explanation
+===========
+
+There are 8 sections of the output information.
+
+1. **Warm Up**
+
+This section lists the time information of warm-up run.
+The detailed explanation is listed below.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Key
+      - Explanation
+    * - round
+      - the number of rounds that have been run.
+    * - first
+      - the run time of first round. unit is millisecond.
+    * - curr
+      - the run time of last round. unit is millisecond.
+    * - min
+      - the minimal run time of all rounds. unit is millisecond.
+    * - max
+      - the maximal run time of all rounds. unit is millisecond.
+    * - avg
+      - the average run time of all rounds. unit is millisecond.
+    * - std
+      - the standard deviation of all rounds.
+
+2. **Run without statistics**
+
+This section lists the run time information without statistics code.
+The detailed explanation is the same as the section of Warm Up.
+
+3. **Run with statistics**
+
+This section lists the run time information with statistics code;
+the time may be longer compared with the second section.
+The detailed explanation is the same as the section of Warm Up.
+
+4. **Sort by Run Order**
+
+This section lists the detailed run information of every operator in your model.
+The operators are listed in run order; every line is an operator of your model.
+The detailed explanation is listed below.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Key
+      - Explanation
+    * - Op Type
+      - the type of operator.
+    * - Start
+      - the start time of the operator. unit is millisecond.
+    * - First
+      - the run time of first round. unit is millisecond.
+    * - Avg
+      - the average run time of all rounds. unit is millisecond.
+    * - %
+      - the percentage of total running time.
+    * - cdf%
+      - the cumulative percentage of running time.
+    * - GMACPS
+      - The number of MACs (multiply-accumulate operations) run per second. The unit is G/s.
+    * - Stride
+      - the stride parameter of the operator if exist.
+    * - Pad
+      - the pad parameter of the operator if exist.
+    * - Filter Shape
+      - the filter shape of the operator if exist.
+    * - Output Shape
+      - the output shape of the operator.
+    * - Dilation
+      - the dilation parameter of the operator if exist.
+    * - Name
+      - the name of the operator.
+
+5. **Sort by Computation Time**
+
+This section lists the top-10 most time-consuming operators.
+The operators are listed based on the computation time;
+the detailed explanation is the same as the previous section.
+
+6. **Stat by Op Type**
+
+This section stats the run information about operators based on operator type.
+
+.. list-table::
+    :header-rows: 0
+
+    * - Op Type
+      - the type of operator.
+    * - Count
+      - the number of operators with the type.
+    * - Avg
+      - the average run time of the operator. unit is millisecond.
+    * - %
+      - the percentage of total running time.
+    * - cdf%
+      - the cumulative percentage of running time.
+    * - MACs
+      - The number of MACs(multiply-accumulation).
+    * - GMACPS
+      - The number of MACs (multiply-accumulate operations) run per second. The unit is G/s.
+    * - Called times
+      - the number of called times in all rounds.
+
+7. **Stat by MACs**
+
+This section stats the MACs information of your model.
+
+.. list-table::
+    :header-rows: 0
+
+    * - total
+      - the number of MACs of your model.
+    * - round
+      - the number of rounds that have been run.
+    * - First
+      - the GMACPS of the first round. unit is G/s.
+    * - Avg
+      - the average GMACPS of all rounds. unit is G/s.
+    * - std
+      - the standard deviation of all rounds.
+
+8. **Summary of Ops' Stat**
+
+This section lists the run time information, which is the summation of every operator's run time
+and may be shorter than the model's run time with statistics.
+The detailed explanation is the same as the section of Warm Up.
diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD
index b086ad24479e74a31e429fc53454ebc38021bdc3..3fe9a006b942f79274bdaf72f6e75845cde59d0c 100644
--- a/mace/benchmark/BUILD
+++ b/mace/benchmark/BUILD
@@ -15,6 +15,7 @@ cc_library(
     srcs = ["statistics.cc"],
     hdrs = ["statistics.h"],
     copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
+    visibility = ["//visibility:public"],
     deps = [
         "//mace/utils",
     ],
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index 7f0afe2405c2bd6f07545a34f7b5deaa17ebd145..bcb9ae752602e08bbf9cec48ef7934ccde1dcef0 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -48,23 +48,6 @@ std::vector<std::string> Split(const std::string &str, char delims) {
   return result;
 }
 
-bool SplitAndParseToInts(const std::string &str,
-                         char delims,
-                         std::vector<int64_t> *result) {
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    int64_t dim = atoi(tmp.data());
-    result->push_back(dim);
-    size_t next_offset = tmp.find(delims);
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-  return true;
-}
-
 }  //  namespace str_util
 
 void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
diff --git a/mace/benchmark/statistics.cc b/mace/benchmark/statistics.cc
index 0f05798c9881660ee59e19600bd045b315472d37..7329c247854679f3dbc12620e75f0b7c02503a54 100644
--- a/mace/benchmark/statistics.cc
+++ b/mace/benchmark/statistics.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <algorithm>
+#include <functional>
 #include <set>
 
 #include "mace/benchmark/statistics.h"
@@ -53,7 +54,6 @@ std::string ShapeToString(
   if (output_shape.empty()) {
     return "";
   }
-
   std::stringstream stream;
   stream << "[";
   for (size_t i = 0; i < output_shape.size(); ++i) {
@@ -94,6 +94,46 @@ std::string VectorToString(const std::vector<T> &vec) {
 
 }  // namespace
 
+
+// Estimates the multiply-accumulate (MAC) count of a single op from its
+// type, filter shape and output shape. Unrecognized op types yield 0.
+// NOTE(review): shapes are indexed without bounds checks; the conv branches
+// read indices 0..3 - presumably 4-D NCHW output / OIHW filter, TODO confirm.
+int64_t StatMACs(const std::string &op_type,
+                 const std::vector<int64_t> &filter_shape,
+                 const std::vector<int64_t> &output_shape) {
+  // Seed the product with an int64_t: a plain `1` would make std::accumulate
+  // use an int accumulator and truncate every partial product to int.
+  auto product = [](const std::vector<int64_t> &dims) -> int64_t {
+    return std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
+                           std::multiplies<int64_t>());
+  };
+  int64_t macs = 0;
+  if (op_type == "Conv2D" || op_type == "Deconv2D") {
+    // Each output element costs filter[1] * filter[2] * filter[3] MACs.
+    macs = output_shape[0] * output_shape[1] * output_shape[2]
+        * output_shape[3]
+        * filter_shape[2] * filter_shape[3] * filter_shape[1];
+  } else if (op_type == "MatMul") {
+    macs = product(output_shape) * filter_shape.back();
+  } else if (op_type == "DepthwiseConv2d") {
+    macs = output_shape[0] * output_shape[1] * output_shape[2]
+        * output_shape[3] * filter_shape[0] * filter_shape[2] * filter_shape[3];
+  } else if (op_type == "DepthwiseDeconv2d") {
+    macs = output_shape[0] * output_shape[1] * output_shape[2]
+        * output_shape[3] * filter_shape[2] * filter_shape[3];
+  } else if (op_type == "FullyConnected") {
+    macs = output_shape[0] * product(filter_shape);
+  } else if (op_type == "BatchNorm") {
+    // One MAC per output element.
+    macs = product(output_shape);
+  } else if (op_type == "ResizeBilinear" || op_type == "ResizeBicubic") {
+    // Counted as 3 MACs per output element.
+    macs = 3 * product(output_shape);
+  }
+  return macs;
+}
+
 void OpStat::StatMetadata(const RunMetadata &meta_data) {
   if (meta_data.op_stats.empty()) {
     LOG(FATAL) << "Op metadata should not be empty";
@@ -112,6 +152,8 @@ void OpStat::StatMetadata(const RunMetadata &meta_data) {
       record->type = op_stat.type;
       record->args = op_stat.args;
       record->output_shape = op_stat.output_shape;
+      record->macs =
+          StatMACs(op_stat.type, op_stat.args.kernels, op_stat.output_shape[0]);
       record->order = order_idx;
       order_idx += 1;
     }
@@ -148,7 +190,7 @@ std::string OpStat::StatByMetric(const Metric metric,
   // generate string
   std::string title = "Sort by " + MetricToString(metric);
   const std::vector<std::string> header = {
-      "Node Type", "Start", "First", "Avg(ms)", "%", "cdf%",
+      "Op Type", "Start", "First", "Avg(ms)", "%", "cdf%", "GMACPS",
       "Stride", "Pad", "Filter Shape", "Output Shape", "Dilation", "name"
   };
   std::vector<std::vector<std::string>> data;
@@ -169,6 +211,9 @@ std::string OpStat::StatByMetric(const Metric metric,
         FloatToString(record.rel_end.sum() * 100.f / total_time_.sum(), 3));
     tuple.push_back(
         FloatToString(accumulate_time * 100.f / total_time_.sum(), 3));
+    tuple.push_back(FloatToString(
+        record.macs < 1e-6 ? record.macs :
+        (record.macs * 1e-3) / record.rel_end.avg(), 3));
     tuple.push_back(VectorToString<int>(record.args.strides));
     if (record.args.padding_type != -1) {
       tuple.push_back(PaddingTypeToString(record.args.padding_type));
@@ -184,40 +229,43 @@ std::string OpStat::StatByMetric(const Metric metric,
   return mace::string_util::StringFormatter::Table(title, header, data);
 }
 
-std::string OpStat::StatByNodeType() const {
+std::string OpStat::StatByOpType() const {
   if (records_.empty()) {
     return "";
   }
   const int64_t round = total_time_.round();
   int64_t total_time = 0;
   std::map<std::string, int64_t> type_time_map;
+  std::map<std::string, int64_t> type_macs_map;
   std::map<std::string, int64_t> type_count_map;
   std::map<std::string, int64_t> type_called_times_map;
-  std::set<std::string> node_types_set;
+  std::set<std::string> op_types_set;
   for (auto &record : records_) {
-    std::string node_type = record.second.type;
-    node_types_set.insert(node_type);
+    std::string op_type = record.second.type;
+    op_types_set.insert(op_type);
 
-    type_time_map[node_type] += record.second.rel_end.sum() / round;
+    type_time_map[op_type] += record.second.rel_end.sum() / round;
+    type_macs_map[op_type] += record.second.macs;
     total_time += record.second.rel_end.sum() / round;
-    type_count_map[node_type] += 1;
-    type_called_times_map[node_type] += record.second.called_times / round;
+    type_count_map[op_type] += 1;
+    type_called_times_map[op_type] += record.second.called_times / round;
   }
-  std::vector<std::string> node_types(node_types_set.begin(),
-                                      node_types_set.end());
-  std::sort(node_types.begin(), node_types.end(),
+  std::vector<std::string> op_types(op_types_set.begin(),
+                                    op_types_set.end());
+  std::sort(op_types.begin(), op_types.end(),
             [&](const std::string &lhs, const std::string &rhs) {
               return type_time_map[lhs] > type_time_map[rhs];
             });
 
-  std::string title = "Stat by node type";
+  std::string title = "Stat by Op Type";
   const std::vector<std::string> header = {
-      "Node Type", "Count", "Avg(ms)", "%", "cdf%", "Called times"
+      "Op Type", "Count", "Avg(ms)", "%", "cdf%", "MACs",
+      "GMACPS", "Called times"
   };
 
   float cdf = 0.0f;
   std::vector<std::vector<std::string>> data;
-  for (auto type : node_types) {
+  for (auto type : op_types) {
     const float avg_time = type_time_map[type] / 1000.0f;
     const float percentage = type_time_map[type] * 100.0f / total_time;
     cdf += percentage;
@@ -228,12 +276,43 @@ std::string OpStat::StatByNodeType() const {
     tuple.push_back(FloatToString(avg_time, 3));
     tuple.push_back(FloatToString(percentage, 3));
     tuple.push_back(FloatToString(cdf, 3));
+    tuple.push_back(IntToString(type_macs_map[type]));
+    tuple.push_back(FloatToString(
+        type_macs_map[type] < 1e-6 ? type_macs_map[type] :
+        (type_macs_map[type] * 1e-3) / type_time_map[type], 3));
     tuple.push_back(IntToString(type_called_times_map[type]));
     data.emplace_back(tuple);
   }
   return mace::string_util::StringFormatter::Table(title, header, data);
 }
 
+
+// Renders a one-row table: the MAC total over all records, the round count,
+// that total scaled and divided by total_time_'s first/avg, and its std dev.
+std::string OpStat::StatByMACs() const {
+  if (records_.empty()) return "";
+
+  const int64_t rounds = total_time_.round();
+  int64_t total_macs = 0;
+  for (const auto &entry : records_) total_macs += entry.second.macs;
+  // Scale once; both throughput columns divide this same value.
+  const double scaled_macs = total_macs * 1e-3;
+
+  std::string title = "Stat by MACs(Multiply-Accumulation)";
+  const std::vector<std::string> header = {
+      "total", "round", "first(G/s)", "avg(G/s)", "std"
+  };
+  const std::vector<std::string> row = {
+      IntToString(total_macs),
+      IntToString(rounds),
+      FloatToString(scaled_macs / total_time_.first(), 3),
+      FloatToString(scaled_macs / total_time_.avg(), 3),
+      FloatToString(total_time_.std_deviation(), 3),
+  };
+  std::vector<std::vector<std::string>> data = {row};
+  return mace::string_util::StringFormatter::Table(title, header, data);
+}
+
 std::string OpStat::Summary() const {
   std::stringstream stream;
   if (!records_.empty()) {
@@ -252,9 +331,11 @@ void OpStat::PrintStat() const {
     stream << StatByMetric(Metric::RUN_ORDER, 0) << std::endl;
     // top-10 op stat by time
     stream << StatByMetric(Metric::COMPUTATION_TIME, 10) << std::endl;
-    // op stat by node type
-    stream << StatByNodeType() << std::endl;
+    // op stat by op type
+    stream << StatByOpType() << std::endl;
   }
+  // print MACs statistics
+  stream << StatByMACs();
   // Print summary
   stream << Summary();
 
diff --git a/mace/benchmark/statistics.h b/mace/benchmark/statistics.h
index 52f963e5331ae095d88f79bbbb8db756fa02d954..f0cf2be69dbba660a0326665a6a3b6b282eef4b7 100644
--- a/mace/benchmark/statistics.h
+++ b/mace/benchmark/statistics.h
@@ -19,6 +19,7 @@
 #include <cmath>
 #include <iomanip>
 #include <limits>
+#include <locale>
 #include <map>
 #include <sstream>
 #include <string>
@@ -33,11 +34,33 @@ class RunMetadata;
 
 namespace benchmark {
 
+// Estimates the number of multiply-accumulate (MAC) operations of an op.
+int64_t StatMACs(const std::string &op_type,
+                 const std::vector<int64_t> &filter_shape,
+                 const std::vector<int64_t> &output_shape);
+
 template <typename IntType>
 std::string IntToString(const IntType v) {
   std::stringstream stream;
   stream << v;
-  return stream.str();
+  std::string src_str = stream.str();
+  size_t size = src_str.size();
+  size_t dst_size = size + ((size-1) / 3);
+  if (src_str[0] == '-') {
+    dst_size = size + ((size-2) / 3);
+  }
+  std::string result(dst_size, ',');
+  size_t dst_idx = dst_size - 1;
+  for (size_t src_idx = 0; src_idx < size; ++src_idx) {
+    if ((src_idx % 3) != 0 || src_idx == 0 || dst_idx == 0) {
+      result[dst_idx] = src_str[size - 1 - src_idx];
+    } else {
+      dst_idx -= 1;
+      result[dst_idx] = src_str[size - 1 - src_idx];
+    }
+    dst_idx -= 1;
+  }
+  return result;
 }
 
 template <typename FloatType>
@@ -127,7 +150,7 @@ enum Metric {
   COMPUTATION_TIME,
 };
 
-class OpStat{
+class OpStat {
  public:
   void StatMetadata(const RunMetadata &meta_data);
 
@@ -136,7 +159,8 @@ class OpStat{
  private:
   std::string StatByMetric(const Metric metric,
       const int top_limit) const;
-  std::string StatByNodeType() const;
+  std::string StatByOpType() const;
+  std::string StatByMACs() const;
   std::string Summary() const;
 
  private:
@@ -145,6 +169,7 @@ class OpStat{
     std::string type;
     std::vector<std::vector<int64_t>> output_shape;
     ConvPoolArgs args;
+    int64_t macs;
     int64_t order;
     TimeInfo<int64_t> start;
     TimeInfo<int64_t> rel_end;
diff --git a/mace/core/net.cc b/mace/core/net.cc
index 1732cfe1a36f04b9fed6c378e67b4637554113ae..7912a6d4209808c25b7b33b47806f3eedf81112b 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -403,8 +403,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
       std::string type = op->debug_def().type();
 
       if (type.compare("Conv2D") == 0 ||
-          type.compare("FusedConv2D") == 0 ||
+          type.compare("Deconv2D") == 0 ||
           type.compare("DepthwiseConv2d") == 0 ||
+          type.compare("DepthwiseDeconv2d") == 0 ||
           type.compare("Pooling") == 0) {
         strides = op->GetRepeatedArgs<int>("strides");
         padding_type = op->GetOptionalArg<int>("padding", -1);
@@ -415,6 +416,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
         } else {
           kernels = op->Input(1)->shape();
         }
+      } else if (type.compare("MatMul") == 0) {
+        bool transpose_a = op->GetOptionalArg<bool>("transpose_a", false);
+        kernels = op->Input(0)->shape();
+        if (transpose_a) {
+          std::swap(kernels[kernels.size()-2], kernels[kernels.size()-1]);
+        }
+      } else if (type.compare("FullyConnected") == 0) {
+        kernels = op->Input(1)->shape();
       }
 
       std::vector<std::vector<int64_t>> output_shapes;
diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc
index 5da1750988604683504ad9ccc60af3c5ff8b8fbf..57be33c2a686451a7fb9bccb7e8ce86f13bdfa3e 100644
--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -28,7 +28,7 @@ namespace testing {
 
 static std::vector<Benchmark *> *all_benchmarks = nullptr;
 static int64_t bytes_processed;
-static int64_t macc_processed;
+static int64_t macs_processed = 0;
 static int64_t accum_time = 0;
 static int64_t start_time = 0;
 
@@ -62,8 +62,8 @@ void Benchmark::Run(const char *pattern) {
   // Internal perf regression tools depends on the output formatting,
   // please keep in consistent when modifying
   printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)",
-         "Iterations", "Input(MB/s)", "MACC(G/s)");
-  printf("%s\n", std::string(width + 44, '-').c_str());
+         "Iterations", "Input(MB/s)", "GMACPS");
+  printf("%s\n", std::string(width + 45, '-').c_str());
   for (auto b : *all_benchmarks) {
     if (!std::regex_match(b->name_, match, regex)) continue;
     int iters;
@@ -71,9 +71,9 @@ void Benchmark::Run(const char *pattern) {
     b->Run(&iters, &seconds);
     float mbps = (bytes_processed * 1e-6) / seconds;
     // MACCs or other computations
-    float gmaccs = (macc_processed * 1e-9) / seconds;
+    float gmacs = (macs_processed * 1e-9) / seconds;
     printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, b->name_.c_str(),
-           seconds * 1e9 / iters, iters, mbps, gmaccs);
+           seconds * 1e9 / iters, iters, mbps, gmacs);
   }
 }
 
@@ -89,7 +89,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) {
   int64_t iters = kMinIters;
   while (true) {
     bytes_processed = -1;
-    macc_processed = -1;
+    macs_processed = 0;
     RestartTiming();
     (*benchmark_func_)(iters);
     StopTiming();
@@ -108,7 +108,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) {
 }
 
 void BytesProcessed(int64_t n) { bytes_processed = n; }
-void MaccProcessed(int64_t n) { macc_processed = n; }
+void MacsProcessed(int64_t n) { macs_processed = n; }
 void RestartTiming() {
   accum_time = 0;
   start_time = NowMicros();
diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h
index b6c070c71caf69dd4e4bad7eddd76d41808da9ce..2eb91e4024ca21cf8e4b24aa26fe523776286589 100644
--- a/mace/core/testing/test_benchmark.h
+++ b/mace/core/testing/test_benchmark.h
@@ -42,7 +42,7 @@ class Benchmark {
 };
 
 void BytesProcessed(int64_t);
-void MaccProcessed(int64_t);
+void MacsProcessed(int64_t);
 void RestartTiming();
 void StartTiming();
 void StopTiming();
diff --git a/mace/ops/BUILD b/mace/ops/BUILD
index 1d8c821d9b7c7da10be25b705cb5865376f345fb..f6e01a74ef7d9d3ac3ef647646a8bf3df85d8667 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -230,6 +230,7 @@ cc_test(
     linkstatic = 1,
     deps = [
         "test",
+        "//mace/benchmark:statistics",
         "//mace/core:test_benchmark_main",
         "//third_party/eigen3",
     ],
diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc
index 76447e9b6134229a002ac94bb09f58b2f857d038..6faf62cebc221b0e9b37ba765c1d381432db44c0 100644
--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -62,7 +62,6 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
   static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(        \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                          \
@@ -119,7 +118,6 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
   static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(        \
       int iters) {                                                            \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                           \
@@ -179,7 +177,6 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
   static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(        \
       int iters) {                                                            \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                           \
@@ -235,7 +232,6 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
   static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(        \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                          \
@@ -292,7 +288,6 @@ void SigmoidBenchmark(
   static void MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
       int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;      \
-    mace::testing::MaccProcessed(tot);                                    \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
     SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                    \
   }                                                                       \
diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc
index f5e11740d79597bc02e9f2fba3c55a6e286b8a7c..b9751557b830a3c621ad9d62147011a3897dcbec 100644
--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -59,7 +59,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
       MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(      \
           int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C);                   \
   }                                                                           \
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index d3467e769f32a69732b366e2d077f5fb6c8959e8..a6afcb077aef3ce9a296bf16b00355ec8a98d268 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -75,7 +75,7 @@ void BatchNorm(
   static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
       int iters) {                                                          \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;        \
-    mace::testing::MaccProcessed(tot);                                      \
+    mace::testing::MacsProcessed(tot);                                      \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
     BatchNorm<DEVICE, TYPE>(iters, N, C, H, W);                             \
   }                                                                         \
diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc
index 9664a917e6256687a7c0bba75a3c5cb52732071e..64264936d65cc097ac47027de448b1f10dde17f4 100644
--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -58,7 +58,6 @@ void BMBatchToSpace(
       MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\
           int iters) {                                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
-    mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG);                      \
   }                                                                            \
diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc
index 9026ffb2b2142b4b7d9d99c303401fc759ca0e05..f0604d56446bced04d9c21a017d74c8e8448f9e6 100644
--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -65,7 +65,6 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
   static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
       int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;      \
-    mace::testing::MaccProcessed(tot);                                    \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
     BiasAdd<DEVICE, TYPE>(iters, N, C, H, W);                             \
   }                                                                       \
diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc
index f5f1df413258fc1a1a66729b7af7d39604281039..4ba0f64c1ce2354f3e8b133664303dff59896a07 100644
--- a/mace/ops/buffer_to_image_benchmark.cc
+++ b/mace/ops/buffer_to_image_benchmark.cc
@@ -68,7 +68,6 @@ void FilterBufferToImage(int iters,
   static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
-    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
     FilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W);            \
   }                                                                  \
diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc
index db5f8494af4d2f0bfceb1288d250572d1e15a830..8ea6d139a30efd2389c003daac152ac36d3e6b15 100644
--- a/mace/ops/channel_shuffle_benchmark.cc
+++ b/mace/ops/channel_shuffle_benchmark.cc
@@ -61,7 +61,6 @@ void ChannelShuffle(
       MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
           int iters) {                                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
-    mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G);                        \
   }                                                                            \
diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc
index a43fc3084f880754612e50d75753d353d09dd04f..eaff9b44256941ef4389610b994ead52f78319f1 100644
--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -49,7 +49,6 @@ void ConcatHelper(int iters, int concat_dim, int dim0, int dim1) {
     net.Run();
   }
   const int64_t tot = static_cast<int64_t>(iters) * dim0 * dim1 * 2;
-  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
@@ -104,7 +103,6 @@ void OpenCLConcatHelper(int iters,
   const int64_t tot =
       static_cast<int64_t>(iters) *
       (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
-  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 91efff7974df9e159f531fb4fcd104751e5ed0f4..a0e780032b541d7cd54ab10dbcef8bfee35b7782 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
@@ -154,9 +155,10 @@ void Conv2d<CPU, uint8_t>(int iters,
         (H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1;        \
     int64_t ow =                                                              \
         (W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1;        \
-    const int64_t macc =                                                      \
-        static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1);   \
-    mace::testing::MaccProcessed(macc);                                       \
+    const int64_t macs =                                                      \
+        static_cast<int64_t>(iters) * mace::benchmark::StatMACs(              \
+            "Conv2D", {OC, C, KH, KW}, {N, oh, ow, OC});                      \
+    mace::testing::MacsProcessed(macs);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION,         \
                          mace::Padding::P, OC);                               \
diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc
index aad6f93d610e8ac6eed96bd0aef9bcbcbf27cdca..5133a28abd338dd463c3cda52228457ebdc101d3 100644
--- a/mace/ops/crop_benchmark.cc
+++ b/mace/ops/crop_benchmark.cc
@@ -44,7 +44,6 @@ void CropHelper(int iters, int crop_axis, int dim1, int offset) {
     net.RunOp(D);
   }
   const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * dim1;
-  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
@@ -96,7 +95,6 @@ void OpenCLCropHelper(int iters,
   const int64_t tot =
       static_cast<int64_t>(iters) *
       (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
-  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc
index 81be17c092ad0d6e91bbdf0514a4c0d94e641b10..9a2c405dfd6bb287c778f74c03fa1375472924af 100644
--- a/mace/ops/deconv_2d_benchmark.cc
+++ b/mace/ops/deconv_2d_benchmark.cc
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
@@ -90,9 +91,10 @@ static void Deconv2d(int iters,
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
     int64_t oh = OH;                                                          \
     int64_t ow = OW;                                                          \
-    const int64_t macc =                                                      \
-        static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1);   \
-    mace::testing::MaccProcessed(macc);                                       \
+    const int64_t macs =                                                      \
+        static_cast<int64_t>(iters) * mace::benchmark::StatMACs(              \
+            "Deconv2D", {OC, C, KH, KW}, {N, OH, OW, OC});                    \
+    mace::testing::MacsProcessed(macs);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     Deconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, OH, OW,         \
                          mace::Padding::P, OC);                               \
diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc
index c9c6dd4016b97869289388ecbfbe200347846269..1283e432a7cde4e57929ebd470732f7bb5bed088 100644
--- a/mace/ops/depth_to_space_benchmark.cc
+++ b/mace/ops/depth_to_space_benchmark.cc
@@ -62,7 +62,6 @@ void DepthToSpace(
       MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
           int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
   }                                                                           \
diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc
index 4d44a9bc136b59fc5e29dd93343638f65b58db88..c5aee849f171e82ff1190ac18140cdc300e8c059 100644
--- a/mace/ops/depthwise_conv2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
@@ -115,9 +116,10 @@ void DepthwiseConv2d(int iters,
         (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1;         \
     int64_t ow =                                                               \
         (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1;         \
-    const int64_t macc =                                                       \
-        static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1);     \
-    mace::testing::MaccProcessed(macc);                                        \
+    const int64_t macs =                                                       \
+        static_cast<int64_t>(iters) * mace::benchmark::StatMACs(               \
+            "DepthwiseConv2d", {M, C, KH, KW}, {N, oh, ow, C});                \
+    mace::testing::MacsProcessed(macs);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,           \
                                   mace::Padding::P, M);                        \
diff --git a/mace/ops/depthwise_deconv2d_benchmark.cc b/mace/ops/depthwise_deconv2d_benchmark.cc
index 081e10d27ce6748d397f635d53b9f74673a15c20..a130ca1d3fd5b58a0e9a89b770061f1f84575315 100644
--- a/mace/ops/depthwise_deconv2d_benchmark.cc
+++ b/mace/ops/depthwise_deconv2d_benchmark.cc
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 
+#include "mace/benchmark/statistics.h"
 #include "mace/core/operator.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
@@ -81,11 +82,12 @@ static void DepthwiseDeconv2d(int iters,
         ##_##TYPE##_##DEVICE(                                                 \
           int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    const int64_t macc =                                                      \
-        static_cast<int64_t>(iters) * N * H * W * KH * KW * C;   \
-    mace::testing::MaccProcessed(macc);                                       \
+    const int64_t macs =                                                      \
+        static_cast<int64_t>(iters) * mace::benchmark::StatMACs(              \
+            "DepthwiseDeconv2d", {1, C, KH, KW}, {N, H, W, C});               \
+    mace::testing::MacsProcessed(macs);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
-    DepthwiseDeconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, S, P);        \
+    DepthwiseDeconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, S, P);         \
   }                                                                           \
   MACE_BENCHMARK(                                                             \
     MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\
diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc
index 95808bc336a46231d920a7c409e846b89725e2ed..b75149bd3bd49e5b83335753eb9c9d5b18d07be2 100644
--- a/mace/ops/eltwise_benchmark.cc
+++ b/mace/ops/eltwise_benchmark.cc
@@ -66,7 +66,6 @@ void EltwiseBenchmark(
       MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
           int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     EltwiseBenchmark<DEVICE, TYPE>(                                           \
         iters, static_cast<ops::EltwiseType>(ELT_TYPE), N, H, W, C);      \
diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc
index bb27c97dcdf2197c6f1e60ef59589b4d7a39b429..bb6dcd80eca7c9a2850d96e8d3cc7915267c7e8d 100644
--- a/mace/ops/fully_connected_benchmark.cc
+++ b/mace/ops/fully_connected_benchmark.cc
@@ -14,6 +14,7 @@
 
 #include <string>
 
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -104,11 +105,12 @@ void FCBenchmark<CPU, uint8_t>(
 #define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE)                     \
   static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
       int iters) {                                                         \
-    const int64_t macc =                                                   \
-        static_cast<int64_t>(iters) * N * C * H * W * OC + OC;             \
+    const int64_t macs =                                                   \
+        static_cast<int64_t>(iters) * mace::benchmark::StatMACs(           \
+            "FullyConnected", {OC, H, W, C}, {N, 1, 1, OC});               \
     const int64_t tot =                                                    \
         static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC;           \
-    mace::testing::MaccProcessed(macc);                                    \
+    mace::testing::MacsProcessed(macs);                                    \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                    \
     FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC);                      \
   }                                                                        \
diff --git a/mace/ops/gather_benchmark.cc b/mace/ops/gather_benchmark.cc
index 5e52875c2074b3b4c23f5e2dab5ebe7a2119e7d9..7fe4a0fb568742b8391245c8a82c135cd78e48a2 100644
--- a/mace/ops/gather_benchmark.cc
+++ b/mace/ops/gather_benchmark.cc
@@ -66,7 +66,6 @@ void GatherBenchmark(int iters,
       MACE_BM_GATHER##_##N##_##IND##_##VOC##_##EMBED##_##TYPE##_##DEVICE( \
           int iters) {                                                    \
     const int64_t tot = static_cast<int64_t>(iters) * N * IND * EMBED;    \
-    mace::testing::MaccProcessed(0);                                      \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
     GatherBenchmark<DEVICE, TYPE>(iters, N, IND, VOC, EMBED);             \
   }                                                                       \
diff --git a/mace/ops/local_response_norm_benchmark.cc b/mace/ops/local_response_norm_benchmark.cc
index b917c495ffd574df459fd6881f276a9c6e09782f..61207af0c42a11c676f28d2a506304ab70a1458d 100644
--- a/mace/ops/local_response_norm_benchmark.cc
+++ b/mace/ops/local_response_norm_benchmark.cc
@@ -59,7 +59,6 @@ static void LocalResponseNorm(
       MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(   \
           int iters) {                                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
-    mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     LocalResponseNorm<DEVICE, TYPE>(iters, N, C, H, W);                        \
   }                                                                            \
diff --git a/mace/ops/lstmcell_benchmark.cc b/mace/ops/lstmcell_benchmark.cc
index 6568025a1a169ed856cf3df8704f635bb9824b2b..a3b9609490d4e9965cb89ee0bde45badd8cee870 100644
--- a/mace/ops/lstmcell_benchmark.cc
+++ b/mace/ops/lstmcell_benchmark.cc
@@ -79,11 +79,11 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) {
   static void                                                                  \
       MACE_BM_LSTMCELL_##N##_##INPUT_SIZE##_##HIDDEN_UNITS##_##TYPE##_##DEVICE(\
         int iters) {                                                           \
-    const int64_t macc =                                                       \
+    const int64_t macs =                                                       \
         static_cast<int64_t>(                                                  \
             iters) * N * (INPUT_SIZE + HIDDEN_UNITS) * 4 * HIDDEN_UNITS;       \
     const int64_t tot = static_cast<int64_t>(iters) * N * INPUT_SIZE;          \
-    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::MacsProcessed(macs);                                        \
     mace::testing::BytesProcessed(tot * (sizeof(TYPE)));                       \
     LSTMCell<DEVICE, TYPE>(iters, N, INPUT_SIZE, HIDDEN_UNITS);                \
   }                                                                            \
diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc
index f118e63f4680b68f0f77bc55697cf318f729caaa..1996587ad47ee60aa524b5e48118a45cca8e4a64 100644
--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -19,6 +19,7 @@
 #include <vector>
 
 #include "public/gemmlowp.h"
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/gemm.h"
 #include "mace/ops/sgemm.h"
@@ -223,9 +224,10 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
 
 #define MACE_BM_MATMUL_FUNC(M, K, N, FUNC, TYPE)                   \
   static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \
-    const int64_t macc = static_cast<int64_t>(iters) * M * K * N;  \
+    const int64_t macs = static_cast<int64_t>(iters) *             \
+        mace::benchmark::StatMACs("MatMul", {K}, {M, N});          \
     const int64_t tot = static_cast<int64_t>(iters) * (M + N) * K; \
-    mace::testing::MaccProcessed(macc);                            \
+    mace::testing::MacsProcessed(macs);                            \
     mace::testing::BytesProcessed(tot * sizeof(TYPE));             \
     MatmulBenchmark_##FUNC(iters, M, K, N);                        \
   }                                                                \
@@ -377,9 +379,10 @@ void MatMulTransposeBenchmark(
 #define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                         \
   static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(        \
       int iters) {                                                             \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W;          \
+    const int64_t macs = static_cast<int64_t>(iters) *                         \
+        mace::benchmark::StatMACs("MatMul", {C}, {N, H, W});                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W);     \
-    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::MacsProcessed(macs);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                          \
   }                                                                            \
@@ -392,9 +395,10 @@ void MatMulTransposeBenchmark(
 #define MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE)               \
   static void MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(    \
       int iters) {                                                             \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W;          \
+    const int64_t macs = static_cast<int64_t>(iters) *                         \
+        mace::benchmark::StatMACs("MatMul", {C}, {N, H, W});                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W);     \
-    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::MacsProcessed(macs);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     MatMulTransposeBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                 \
   }                                                                            \
diff --git a/mace/ops/memory_benchmark.cc b/mace/ops/memory_benchmark.cc
index e3bb30a81f5880d663257d2aafbaab277dee4e9d..73f3bdeb46ce8d1ee2d3013ff5c23aa7e15ab319 100644
--- a/mace/ops/memory_benchmark.cc
+++ b/mace/ops/memory_benchmark.cc
@@ -94,7 +94,6 @@ void MemoryAccessBenchmark_NHCW(
   static void MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER( \
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
-    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot * sizeof(float));              \
     MemoryAccessBenchmark_##ORDER(iters, N, H, W, C);                \
   }                                                                  \
diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc
index fb7f4e14426677b1ee26bf0ba3459ea5043074ea..0125b4f5b4b1c44e462f077ba6a9d17165764ab3 100644
--- a/mace/ops/pad_benchmark.cc
+++ b/mace/ops/pad_benchmark.cc
@@ -57,7 +57,6 @@ void Pad(int iters, int batch, int height,
   static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD);                               \
   }                                                                          \
diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc
index c48cc8771fec57898dfe648abc7db7438bd5e330..880c0cad5462c78dfa4ce0f50816ffb6dbe0d002 100644
--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -81,7 +81,6 @@ void Pooling(int iters,
         ##TYPE##_##DEVICE(                                                     \
           int iters) {                                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
-    mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     Pooling<DEVICE, TYPE>(iters, N, C, H, W, KE, STRIDE, Padding::PA,          \
                     PoolingType::PO);                                          \
diff --git a/mace/ops/quantize_benchmark.cc b/mace/ops/quantize_benchmark.cc
index 62a534b721894360b922270fe03833be60ad582a..0c1493b80450a586cde90d80285ad57629cdc276 100644
--- a/mace/ops/quantize_benchmark.cc
+++ b/mace/ops/quantize_benchmark.cc
@@ -82,7 +82,6 @@ void Dequantize(int iters, int count) {
     MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE(              \
       int iters) {                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N;   \
-    mace::testing::MaccProcessed(tot);                     \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));    \
     Quantize<DEVICE, TYPE>(iters, N);                      \
   }                                                        \
@@ -97,7 +96,6 @@ void Dequantize(int iters, int count) {
     MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE(            \
       int iters) {                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N;   \
-    mace::testing::MaccProcessed(tot);                     \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));    \
     Dequantize<DEVICE, TYPE>(iters, N);                    \
   }                                                        \
diff --git a/mace/ops/reduce_benchmark.cc b/mace/ops/reduce_benchmark.cc
index ec8807b0488892de1ac22eb5136dc5b524482c27..663c3b45d7a3aa3eb585e0ef14c31c1b093933dc 100644
--- a/mace/ops/reduce_benchmark.cc
+++ b/mace/ops/reduce_benchmark.cc
@@ -60,7 +60,6 @@ void Reduce(int iters, int batch, int channels,
     MACE_BM_REDUCE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
     Reduce<DEVICE, TYPE>(iters, N, C, H, W);        \
   }                                                                  \
diff --git a/mace/ops/resize_bicubic_benchmark.cc b/mace/ops/resize_bicubic_benchmark.cc
index 5ababebaa29676f289c368222bde120acf9c0aca..85e073fd1fba1de4c1e53da9cee19c3b8d964ecc 100644
--- a/mace/ops/resize_bicubic_benchmark.cc
+++ b/mace/ops/resize_bicubic_benchmark.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include <string>
+
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -69,9 +71,10 @@ void ResizeBicubicBenchmark(int iters,
       MACE_BM_RESIZE_BICUBIC_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
         ##DEVICE(                                                             \
           int iters) {                                                        \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3;   \
+    const int64_t macs = static_cast<int64_t>(iters) *                        \
+        mace::benchmark::StatMACs("ResizeBicubic", {}, {N, H1, W1, C});       \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0;        \
-    mace::testing::MaccProcessed(macc);                                       \
+    mace::testing::MacsProcessed(macs);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     ResizeBicubicBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1);        \
   }                                                                           \
diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc
index bace4f10374d681df889e6fd5451c37abc2d646c..ddc0f508b0d0e677fac6abd8bb7a61d79087b4e3 100644
--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include <string>
+
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -75,9 +77,10 @@ void ResizeBilinearBenchmark(int iters,
       MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
         ##DEVICE(                                                              \
           int iters) {                                                         \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3;    \
+    const int64_t macs = static_cast<int64_t>(iters) *                         \
+        mace::benchmark::StatMACs("ResizeBilinear", {}, {N, H1, W1, C});       \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0;         \
-    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::MacsProcessed(macs);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1);        \
   }                                                                            \
diff --git a/mace/ops/reverse_benchmark.cc b/mace/ops/reverse_benchmark.cc
index 9630f696011b5a04e1ee4ed18e03e19de9b1e333..9b7a915a58a3aaed3988889ef2cd80e855a4423a 100644
--- a/mace/ops/reverse_benchmark.cc
+++ b/mace/ops/reverse_benchmark.cc
@@ -51,10 +51,7 @@ void Reverse(int iters, int batch, int channels, int height, int width) {
 #define MACE_BM_REVERSE_MACRO(N, C, H, W, TYPE, DEVICE)                   \
   static void MACE_BM_REVERSE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
       int iters) {                                                        \
-    const int64_t macc =                                                  \
-        static_cast<int64_t>(iters) * N * C * H * W;                      \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;      \
-    mace::testing::MaccProcessed(macc);                                   \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
     Reverse<DEVICE, TYPE>(iters, N, C, H, W);                             \
   }                                                                       \
diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc
index 25095da54f94324afd34274f79b09c59c1b4e3a7..819544b289b17547cb0d4443f4408dc2ad60d91f 100644
--- a/mace/ops/softmax_benchmark.cc
+++ b/mace/ops/softmax_benchmark.cc
@@ -98,7 +98,6 @@ void SoftmaxBenchmark<CPU, uint8_t>(
   static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
       int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;      \
-    mace::testing::MaccProcessed(tot);                                    \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
     SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                    \
   }                                                                       \
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index cacadfcd9673019a9c3f7938d72ebc3d45608c96..168461de213d9709dca3f1f9cfb6d3d1fff4f13c 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -64,7 +64,6 @@ void BMSpaceToBatch(
     MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\
         int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
-    mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE);                    \
   }                                                                            \
diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc
index 3311d6186272cee46cc53f8e6d9426e9eb962295..6bd7755e0c9da1b2503cbf66090f99d361d2fd99 100644
--- a/mace/ops/space_to_depth_benchmark.cc
+++ b/mace/ops/space_to_depth_benchmark.cc
@@ -62,7 +62,6 @@ void SpaceToDepth(
       MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
           int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
   }                                                                           \
diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc
index b21da8f5c7f055437a6a59952c3bea4957636efd..020c32142ce6ffdd743b9c7e4b054062811afa9d 100644
--- a/mace/ops/split_benchmark.cc
+++ b/mace/ops/split_benchmark.cc
@@ -65,7 +65,6 @@ void BMSplitHelper(int iters,
       MACE_BM_SPLIT_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE(        \
           int iters) {                                                       \
         const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;     \
-        mace::testing::MaccProcessed(tot);                                   \
         mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                  \
         BMSplitHelper<DEVICE, TYPE>(iters, {N, H, W, C}, NO);                \
       }                                                                      \
diff --git a/mace/ops/sqrdiff_mean_benchmark.cc b/mace/ops/sqrdiff_mean_benchmark.cc
index 353d8e7addfa4748fb7a160710bea226d3c569ab..1d2a7aa377d7fc8fb9b5c8eeb987b7a20e4ba40b 100644
--- a/mace/ops/sqrdiff_mean_benchmark.cc
+++ b/mace/ops/sqrdiff_mean_benchmark.cc
@@ -63,7 +63,6 @@ void SqrDiffMean(int iters, int batch, int channels,
     MACE_BM_SQRDIFF_MEAN_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
     SqrDiffMean<DEVICE, TYPE>(iters, N, C, H, W);        \
   }                                                                  \
diff --git a/mace/ops/transpose_benchmark.cc b/mace/ops/transpose_benchmark.cc
index f584239a6d3277c934f93eb356b11919e877993e..372f2f9d08917820e3c88b435e01d786715ab050 100644
--- a/mace/ops/transpose_benchmark.cc
+++ b/mace/ops/transpose_benchmark.cc
@@ -58,7 +58,6 @@ void TransposeBenchmark(int iters,
   static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE(     \
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * H * W;         \
-    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
     TransposeBenchmark<DEVICE, TYPE>(iters, {H, W}, {1, 0});         \
   }                                                                  \
@@ -72,7 +71,6 @@ void TransposeBenchmark(int iters,
     MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
       DEVICE(int iters) {                                                     \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3});  \
   }                                                                           \
diff --git a/mace/test/BUILD b/mace/test/BUILD
index 63faecfe7cb782145054f0abe55564e8ac7ab0f8..283dd486ff812df5d7b729c67a5f1a449a751513 100644
--- a/mace/test/BUILD
+++ b/mace/test/BUILD
@@ -7,10 +7,10 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//mace:mace.bzl",
     "if_android",
+    "if_android_armv7",
     "if_hexagon_enabled",
-    "if_not_hexagon_enabled",
-    "if_openmp_enabled",
     "if_neon_enabled",
     "if_opencl_enabled",
+    "if_openmp_enabled",
     "if_quantize_enabled",
 )
@@ -32,16 +32,18 @@ cc_test(
         "-Wextra",
         "-Wno-missing-field-initializers",
     ] + if_openmp_enabled([
         "-fopenmp",
-        "-DMACE_ENABLE_OPENMP",
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
     ]) + if_quantize_enabled([
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
@@ -62,16 +64,18 @@ cc_test(
         "-Wextra",
         "-Wno-missing-field-initializers",
     ] + if_openmp_enabled([
         "-fopenmp",
-        "-DMACE_ENABLE_OPENMP",
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
     ]) + if_quantize_enabled([
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
@@ -92,16 +96,18 @@ cc_test(
         "-Wextra",
         "-Wno-missing-field-initializers",
     ] + if_openmp_enabled([
         "-fopenmp",
-        "-DMACE_ENABLE_OPENMP",
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
     ]) + if_quantize_enabled([
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py
index 71f3cc148036dfe0221b07757ad55c1ff09e7536..96ad13a475e681ad9cce2cf78b0198d679bb34d0 100644
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -50,7 +50,7 @@ def ops_benchmark_stdout_processor(stdout, dev, abi):
         if len(parts) == 5 and parts[0].startswith("BM_"):
             metrics["%s.time_ms" % parts[0]] = str(float(parts[1]) / 1e6)
             metrics["%s.input_mb_per_sec" % parts[0]] = parts[3]
-            metrics["%s.gmacc_per_sec" % parts[0]] = parts[4]
+            metrics["%s.gmac_per_sec" % parts[0]] = parts[4]
 
     # platform = dev[YAMLKeyword.target_socs]
     # model = dev[YAMLKeyword.device_name]