From 5f021151195a737100d2bcceb21e538cefe4a9e5 Mon Sep 17 00:00:00 2001
From: liuqi <liuqi10@xiaomi.com>
Date: Wed, 12 Dec 2018 13:05:57 +0800
Subject: [PATCH] Add MACs statistics for model benchmark tool and related docs

---
 .gitlab-ci.yml                            |  57 +++--
 docs/index.rst                            |   1 +
 docs/user_guide/advanced_usage.rst        |   2 +
 docs/user_guide/basic_usage.rst           |   2 +-
 docs/user_guide/benchmark.rst             | 293 ++++++++++++++++++++++
 mace/benchmark/BUILD                      |   1 +
 mace/benchmark/benchmark_model.cc         |  17 --
 mace/benchmark/statistics.cc              | 115 +++++++--
 mace/benchmark/statistics.h               |  31 ++-
 mace/core/net.cc                          |  11 +-
 mace/core/testing/test_benchmark.cc       |  14 +-
 mace/core/testing/test_benchmark.h        |   2 +-
 mace/ops/BUILD                            |   1 +
 mace/ops/activation_benchmark.cc          |   5 -
 mace/ops/addn_benchmark.cc                |   1 -
 mace/ops/batch_norm_benchmark.cc          |   2 +-
 mace/ops/batch_to_space_benchmark.cc      |   1 -
 mace/ops/bias_add_benchmark.cc            |   1 -
 mace/ops/buffer_to_image_benchmark.cc     |   1 -
 mace/ops/channel_shuffle_benchmark.cc     |   1 -
 mace/ops/concat_benchmark.cc              |   2 -
 mace/ops/conv_2d_benchmark.cc             |   8 +-
 mace/ops/crop_benchmark.cc                |   2 -
 mace/ops/deconv_2d_benchmark.cc           |   8 +-
 mace/ops/depth_to_space_benchmark.cc      |   1 -
 mace/ops/depthwise_conv2d_benchmark.cc    |   8 +-
 mace/ops/depthwise_deconv2d_benchmark.cc  |  10 +-
 mace/ops/eltwise_benchmark.cc             |   1 -
 mace/ops/fully_connected_benchmark.cc     |   8 +-
 mace/ops/gather_benchmark.cc              |   1 -
 mace/ops/local_response_norm_benchmark.cc |   1 -
 mace/ops/lstmcell_benchmark.cc            |   4 +-
 mace/ops/matmul_benchmark.cc              |  16 +-
 mace/ops/memory_benchmark.cc              |   1 -
 mace/ops/pad_benchmark.cc                 |   1 -
 mace/ops/pooling_benchmark.cc             |   1 -
 mace/ops/quantize_benchmark.cc            |   2 -
 mace/ops/reduce_benchmark.cc              |   1 -
 mace/ops/resize_bicubic_benchmark.cc      |   7 +-
 mace/ops/resize_bilinear_benchmark.cc     |   7 +-
 mace/ops/reverse_benchmark.cc             |   3 +-
 mace/ops/softmax_benchmark.cc             |   1 -
 mace/ops/space_to_batch_benchmark.cc      |   1 -
 mace/ops/space_to_depth_benchmark.cc      |   1 -
 mace/ops/split_benchmark.cc               |   2 +-
 mace/ops/sqrdiff_mean_benchmark.cc        |   1 -
 mace/ops/transpose_benchmark.cc           |   2 -
 mace/test/BUILD                           |  39 +--
 tools/bazel_adb_run.py                    |   2 +-
 49 files changed, 562 insertions(+), 139 deletions(-)
 create mode 100644 docs/user_guide/benchmark.rst

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5e3a22c5..f454edf8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -9,6 +9,7 @@ stages:
   - api_test
   - python_tools_tests
   - model_tests
+  - quantization_tests
   - build_android_demo
   - ops_benchmark
   - extra_tests
@@ -62,6 +63,14 @@ api_test:
     - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
     - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
     - python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
+    - >
+      if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
+        GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
+        DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
+      fi
+    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
+    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
+    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
 
 ops_benchmark:
   stage: ops_benchmark
@@ -103,7 +112,7 @@ ndk_versions_compatible_tests:
         DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
       fi
     - >
-      for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b;
+      for ndk in android-ndk-r15c android-ndk-r16 android-ndk-r17b;
       do
       new_ndk_path=${prefix_path}${ndk};
       if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then
@@ -111,8 +120,12 @@ ndk_versions_compatible_tests:
         export PATH=$ANDROID_NDK_HOME:$PATH;
         echo "ndk path: $ANDROID_NDK_HOME";
         if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
-        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
-        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
       fi
       done
     - export ANDROID_NDK_HOME=$DEFAULT_NDK_PATH
@@ -131,9 +144,9 @@ python_tools_tests:
         DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
       fi
     - >
-      python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a,arm64 --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,armhf --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,armhf --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
 
 model_tests:
   stage: model_tests
@@ -142,23 +155,39 @@ model_tests:
     - rm -rf mace-models
     - rm -rf generic-mobile-devices
     - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
+    - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml
     - >
       if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
         GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
         DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
       fi
     - >
-      for CONF_FILE in mace-models/mobilenet-v1/mobilenet-v1.yml mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml;
-      do
-      python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
-      done
+      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
     - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
     - >
       python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+    - rm -rf mace-models
+
+quantization_tests:
+  stage: quantization_tests
+  script:
+    - pwd
+    - rm -rf mace-models
+    - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
+    - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml
+    - >
+      if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
+        GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
+        DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
+      fi
+    - >
+      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
     - rm -rf mace-models
 
 build_android_demo:
diff --git a/docs/index.rst b/docs/index.rst
index f839a13f..7545f2aa 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -27,6 +27,7 @@ The main documentation is organized into the following sections:
 
    user_guide/basic_usage
    user_guide/advanced_usage
+   user_guide/benchmark
    user_guide/op_lists
    user_guide/quantization_usage
 
diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst
index 93ebb4f8..8395c45b 100644
--- a/docs/user_guide/advanced_usage.rst
+++ b/docs/user_guide/advanced_usage.rst
@@ -379,6 +379,8 @@ Useful Commands
 
 * **benchmark and profile model**
 
+The detailed information is in :doc:`benchmark`.
+
 .. code:: sh
 
     # Benchmark model, get detailed statistics of each Op.
diff --git a/docs/user_guide/basic_usage.rst b/docs/user_guide/basic_usage.rst
index d4d404ba..6d59a68e 100644
--- a/docs/user_guide/basic_usage.rst
+++ b/docs/user_guide/basic_usage.rst
@@ -227,7 +227,7 @@ to run and validate your model.
 
 * **benchmark**
 
-    benchmark and profile the model.
+    benchmark and profile the model. The details are in :doc:`benchmark`.
 
     .. code:: sh
 
diff --git a/docs/user_guide/benchmark.rst b/docs/user_guide/benchmark.rst
new file mode 100644
index 00000000..a190a7cc
--- /dev/null
+++ b/docs/user_guide/benchmark.rst
@@ -0,0 +1,293 @@
+Benchmark usage
+===============
+
+This part contains the usage of MACE benchmark tools.
+
+Overview
+--------
+
+As mentioned in the previous part, there are two kinds of benchmark tools:
+one for operators and the other for models.
+
+Operator Benchmark
+------------------
+
+Operator Benchmark is used to test and optimize the performance of a specific operator.
+
+=====
+Usage
+=====
+
+    .. code:: bash
+
+        python tools/bazel_adb_run.py --target="//mace/ops:ops_benchmark" --run_target=True  --args="--filter=.*BM_CONV.*"
+
+======
+Output
+======
+
+    .. code:: bash
+
+        Benchmark                                                    Time(ns) Iterations Input(MB/s)   GMACPS
+        ------------------------------------------------------------------------------------------------------
+        MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_CPU       1759129        479     114.09      29.21
+        MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_GPU       4031301        226      49.79      12.75
+        MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_half_GPU        3996357        266      25.11      12.86
+        MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_uint8_t_CPU      914994       1093      54.84      56.15
+
+
+===========
+Explanation
+===========
+
+.. list-table::
+    :header-rows: 1
+
+    * - Options
+      - Usage
+    * - Benchmark
+      - Benchmark unit name.
+    * - Time
+      - Time of one round.
+    * - Iterations
+      - The number of iterations to run, which is between 10 and 1,000,000,000. The value is chosen so that the total run time does not exceed 1s.
+    * - Input
+      - The bandwidth of processing the input. The unit is MB/s.
+    * - GMACPS
+      - The speed of running MACs (multiply-accumulate operations). The unit is G/s.
+
+Model Benchmark
+---------------
+
+Model Benchmark is used to test and optimize the performance of your model.
+This tool records the running time of the model and the detailed running information of each operator in your model.
+
+=====
+Usage
+=====
+
+    .. code:: bash
+
+        python tools/converter.py benchmark --config=/path/to/your/model_deployment.yml
+
+======
+Output
+======
+
+    .. code:: bash
+
+        I benchmark_model.cc:158 ---------------------------------------------------------------------
+        I benchmark_model.cc:158                                Warm Up
+        I benchmark_model.cc:158 ----------------------------------------------------------------------
+        I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |   std |
+        I benchmark_model.cc:158 ----------------------------------------------------------------------
+        I benchmark_model.cc:158 |     1 |    51.481 |   51.481 |  51.481 |  51.481 |  51.481 | 0.000 |
+        I benchmark_model.cc:158 ----------------------------------------------------------------------
+        I benchmark_model.cc:158
+        I benchmark_model.cc:158 ------------------------------------------------------------------------
+        I benchmark_model.cc:158                          Run without statistics
+        I benchmark_model.cc:158 -------------------------------------------------------------------------
+        I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |      std |
+        I benchmark_model.cc:158 -------------------------------------------------------------------------
+        I benchmark_model.cc:158 |   100 |    30.272 |   31.390 |  29.938 |  45.966 |  30.913 | 1850.983 |
+        I benchmark_model.cc:158 -------------------------------------------------------------------------
+        I benchmark_model.cc:158
+        I benchmark_model.cc:158 -----------------------------------------------------------------------
+        I benchmark_model.cc:158                           Run with statistics
+        I benchmark_model.cc:158 ------------------------------------------------------------------------
+        I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |     std |
+        I benchmark_model.cc:158 ------------------------------------------------------------------------
+        I benchmark_model.cc:158 |   100 |    32.358 |   33.327 |  32.293 |  33.607 |  33.002 | 310.435 |
+        I benchmark_model.cc:158 ------------------------------------------------------------------------
+        I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343                                                                                      Sort by Run Order
+        I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343 |         Op Type |  Start | First | Avg(ms) |     % |    cdf% | GMACPS | Stride |   Pad |    Filter Shape |   Output Shape | Dilation |                                               name |
+        I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343 |       Transpose |  0.000 | 0.102 |   0.100 | 0.315 |   0.315 |  0.000 |        |       |                 |  [1,3,224,224] |          |                                              input |
+        I statistics.cc:343 |          Conv2D |  0.107 | 1.541 |   1.570 | 4.943 |   5.258 |  6.904 |  [2,2] |  SAME |      [32,3,3,3] | [1,32,112,112] |    [1,1] |             MobilenetV1/MobilenetV1/Conv2d_0/Relu6 |
+        I statistics.cc:343 | DepthwiseConv2d |  1.724 | 0.936 |   0.944 | 2.972 |   8.230 |  3.827 |  [1,1] |  SAME |      [1,32,3,3] | [1,32,112,112] |    [1,1] |   MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6 |
+        I statistics.cc:343 |         Softmax | 32.835 | 0.039 |   0.042 | 0.131 |  99.996 |  0.000 |        |       |                 |       [1,1001] |          |                    MobilenetV1/Predictions/Softmax |
+        I statistics.cc:343 |        Identity | 32.880 | 0.001 |   0.001 | 0.004 | 100.000 |  0.000 |        |       |                 |       [1,1001] |          | mace_output_node_MobilenetV1/Predictions/Reshape_1 |
+        I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343                                                                              Sort by Computation Time
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343 | Op Type |  Start | First | Avg(ms) |     % |   cdf% | GMACPS | Stride |  Pad |    Filter Shape |   Output Shape | Dilation |                                              name |
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343 |  Conv2D | 30.093 | 2.102 |   2.198 | 6.922 |  6.922 | 23.372 |  [1,1] | SAME | [1024,1024,1,1] |   [1,1024,7,7] |    [1,1] | MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D |  7.823 | 2.115 |   2.164 | 6.813 | 13.735 | 23.747 |  [1,1] | SAME |   [128,128,1,1] |  [1,128,56,56] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 15.859 | 2.119 |   2.109 | 6.642 | 20.377 | 24.358 |  [1,1] | SAME |   [512,512,1,1] |  [1,512,14,14] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 23.619 | 2.087 |   2.096 | 6.599 | 26.976 | 24.517 |  [1,1] | SAME |   [512,512,1,1] |  [1,512,14,14] |    [1,1] | MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 26.204 | 2.081 |   2.093 | 6.590 | 33.567 | 24.549 |  [1,1] | SAME |   [512,512,1,1] |  [1,512,14,14] |    [1,1] | MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 21.038 | 2.036 |   2.091 | 6.585 | 40.152 | 24.569 |  [1,1] | SAME |   [512,512,1,1] |  [1,512,14,14] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 18.465 | 2.034 |   2.082 | 6.554 | 46.706 | 24.684 |  [1,1] | SAME |   [512,512,1,1] |  [1,512,14,14] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D |  2.709 | 1.984 |   2.058 | 6.482 | 53.188 | 12.480 |  [1,1] | SAME |     [64,32,1,1] | [1,64,112,112] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D | 12.220 | 1.788 |   1.901 | 5.986 | 59.174 | 27.027 |  [1,1] | SAME |   [256,256,1,1] |  [1,256,28,28] |    [1,1] |  MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6 |
+        I statistics.cc:343 |  Conv2D |  0.107 | 1.541 |   1.570 | 4.943 | 64.117 |  6.904 |  [2,2] | SAME |      [32,3,3,3] | [1,32,112,112] |    [1,1] |            MobilenetV1/MobilenetV1/Conv2d_0/Relu6 |
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        I statistics.cc:343
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------
+        I statistics.cc:343                                        Stat by Op Type
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------
+        I statistics.cc:343 |         Op Type | Count | Avg(ms) |      % |    cdf% |        MACs | GMACPS | Called times |
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------
+        I statistics.cc:343 |          Conv2D |    15 |  24.978 | 78.693 |  78.693 | 551,355,392 | 22.074 |           15 |
+        I statistics.cc:343 | DepthwiseConv2d |    13 |   6.543 | 20.614 |  99.307 |  17,385,984 |  2.657 |           13 |
+        I statistics.cc:343 |       Transpose |     1 |   0.100 |  0.315 |  99.622 |           0 |  0.000 |            1 |
+        I statistics.cc:343 |         Pooling |     1 |   0.072 |  0.227 |  99.849 |           0 |  0.000 |            1 |
+        I statistics.cc:343 |         Softmax |     1 |   0.041 |  0.129 |  99.978 |           0 |  0.000 |            1 |
+        I statistics.cc:343 |         Squeeze |     1 |   0.006 |  0.019 |  99.997 |           0 |  0.000 |            1 |
+        I statistics.cc:343 |        Identity |     1 |   0.001 |  0.003 | 100.000 |           0 |  0.000 |            1 |
+        I statistics.cc:343 ----------------------------------------------------------------------------------------------
+        I statistics.cc:343
+        I statistics.cc:343 ---------------------------------------------------------
+        I statistics.cc:343           Stat by MACs(Multiply-Accumulation)
+        I statistics.cc:343 ---------------------------------------------------------
+        I statistics.cc:343 |       total | round | first(G/s) | avg(G/s) |     std |
+        I statistics.cc:343 ---------------------------------------------------------
+        I statistics.cc:343 | 568,741,376 |   100 |     18.330 |   17.909 | 301.326 |
+        I statistics.cc:343 ---------------------------------------------------------
+        I statistics.cc:343 ------------------------------------------------------------------------
+        I statistics.cc:343                           Summary of Ops' Stat
+        I statistics.cc:343 ------------------------------------------------------------------------
+        I statistics.cc:343 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |     std |
+        I statistics.cc:343 ------------------------------------------------------------------------
+        I statistics.cc:343 |   100 |    31.028 |   32.093 |  31.028 |  32.346 |  31.758 | 301.326 |
+        I statistics.cc:343 ------------------------------------------------------------------------
+
+
+===========
+Explanation
+===========
+
+There are 8 sections of the output information.
+
+1. **Warm Up**
+
+This section lists the time information of the warm-up run.
+The detailed explanation is listed below.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Key
+      - Explanation
+    * - round
+      - the number of rounds that have been run.
+    * - first
+      - the run time of first round. unit is millisecond.
+    * - curr
+      - the run time of last round. unit is millisecond.
+    * - min
+      - the minimal run time of all rounds. unit is millisecond.
+    * - max
+      - the maximal run time of all rounds. unit is millisecond.
+    * - avg
+      - the average run time of all rounds. unit is millisecond.
+    * - std
+      - the standard deviation of all rounds.
+
+2. **Run without statistics**
+
+This section lists the run time information without statistics code.
+The detailed explanation is the same as the section of Warm Up.
+
+3. **Run with statistics**
+
+This section lists the run time information with statistics code;
+the time may be longer compared with the second section.
+The detailed explanation is the same as the section of Warm Up.
+
+4. **Sort by Run Order**
+
+This section lists the detailed run information of every operator in your model.
+The operators are listed in run order; every line is an operator of your model.
+The detailed explanation is listed below.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Key
+      - Explanation
+    * - Op Type
+      - the type of operator.
+    * - Start
+      - the start time of the operator. unit is millisecond.
+    * - First
+      - the run time of first round. unit is millisecond.
+    * - Avg
+      - the average run time of all rounds. unit is millisecond.
+    * - %
+      - the percentage of total running time.
+    * - cdf%
+      - the cumulative percentage of running time.
+    * - GMACPS
+      - The number of MACs (multiply-accumulate operations) run per second. The unit is G/s.
+    * - Stride
+      - the stride parameter of the operator, if it exists.
+    * - Pad
+      - the pad parameter of the operator, if it exists.
+    * - Filter Shape
+      - the filter shape of the operator, if it exists.
+    * - Output Shape
+      - the output shape of the operator.
+    * - Dilation
+      - the dilation parameter of the operator, if it exists.
+    * - Name
+      - the name of the operator.
+
+5. **Sort by Computation Time**
+
+This section lists the top-10 most time-consuming operators.
+The operators are listed in order of computation time.
+The detailed explanation is the same as the previous section.
+
+6. **Stat by Op Type**
+
+This section summarizes the run information of operators, grouped by operator type.
+
+.. list-table::
+    :header-rows: 0
+
+    * - Op Type
+      - the type of operator.
+    * - Count
+      - the number of operators with the type.
+    * - Avg
+      - the average run time of the operator. unit is millisecond.
+    * - %
+      - the percentage of total running time.
+    * - cdf%
+      - the cumulative percentage of running time.
+    * - MACs
+      - The number of MACs (multiply-accumulate operations).
+    * - GMACPS
+      - The number of MACs (multiply-accumulate operations) run per second. The unit is G/s.
+    * - Called times
+      - the number of called times in all rounds.
+
+7. **Stat by MACs**
+
+This section summarizes the MACs information of your model.
+
+.. list-table::
+    :header-rows: 0
+
+    * - total
+      - the number of MACs of your model.
+    * - round
+      - the number of rounds that have been run.
+    * - first
+      - the GMACPS of the first round. unit is G/s.
+    * - avg
+      - the average GMACPS of all rounds. unit is G/s.
+    * - std
+      - the standard deviation of all rounds.
+
+8. **Summary of Ops' Stat**
+
+This section lists the run time information, which is the summation of every operator's run time
+and may be shorter than the model's run time with statistics.
+The detailed explanation is the same as the section of Warm Up.
diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD
index b086ad24..3fe9a006 100644
--- a/mace/benchmark/BUILD
+++ b/mace/benchmark/BUILD
@@ -15,6 +15,7 @@ cc_library(
     srcs = ["statistics.cc"],
     hdrs = ["statistics.h"],
     copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
+    visibility = ["//visibility:public"],
     deps = [
         "//mace/utils",
     ],
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index 7f0afe24..bcb9ae75 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -48,23 +48,6 @@ std::vector<std::string> Split(const std::string &str, char delims) {
   return result;
 }
 
-bool SplitAndParseToInts(const std::string &str,
-                         char delims,
-                         std::vector<int64_t> *result) {
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    int64_t dim = atoi(tmp.data());
-    result->push_back(dim);
-    size_t next_offset = tmp.find(delims);
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-  return true;
-}
-
 }  //  namespace str_util
 
 void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
diff --git a/mace/benchmark/statistics.cc b/mace/benchmark/statistics.cc
index 0f05798c..7329c247 100644
--- a/mace/benchmark/statistics.cc
+++ b/mace/benchmark/statistics.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <algorithm>
+#include <functional>
 #include <set>
 
 #include "mace/benchmark/statistics.h"
@@ -53,7 +54,6 @@ std::string ShapeToString(
   if (output_shape.empty()) {
     return "";
   }
-
   std::stringstream stream;
   stream << "[";
   for (size_t i = 0; i < output_shape.size(); ++i) {
@@ -94,6 +94,46 @@ std::string VectorToString(const std::vector<T> &vec) {
 
 }  // namespace
 
+
+// Estimates how many multiply-accumulate operations (MACs) a single run of
+// an operator performs, based on its filter and output shapes.
+// Shapes are indexed without bounds checks, so callers must pass shapes of
+// the rank each branch reads (rank 4 for the convolution-type branches).
+// Op types that are not listed below report 0 MACs.
+int64_t StatMACs(const std::string &op_type,
+                 const std::vector<int64_t> &filter_shape,
+                 const std::vector<int64_t> &output_shape) {
+  // std::accumulate keeps its running value in the type of the initial
+  // argument, so that argument must be int64_t: a plain `1` would narrow
+  // every partial product to int and corrupt large element counts.
+  auto product = [](const std::vector<int64_t> &dims) {
+    return std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
+                           std::multiplies<int64_t>());
+  };
+  int64_t macs = 0;
+  if (op_type == "Conv2D" || op_type == "Deconv2D") {
+    macs = output_shape[0] * output_shape[1] * output_shape[2]
+        * output_shape[3]
+        * filter_shape[2] * filter_shape[3] * filter_shape[1];
+  } else if (op_type == "MatMul") {
+    macs = product(output_shape) * filter_shape.back();
+  } else if (op_type == "DepthwiseConv2d") {
+    macs = output_shape[0] * output_shape[1] * output_shape[2]
+        * output_shape[3] * filter_shape[0] * filter_shape[2] * filter_shape[3];
+  } else if (op_type == "DepthwiseDeconv2d") {
+    macs = output_shape[0] * output_shape[1] * output_shape[2]
+        * output_shape[3] * filter_shape[2] * filter_shape[3];
+  } else if (op_type == "FullyConnected") {
+    macs = output_shape[0] * product(filter_shape);
+  } else if (op_type == "BatchNorm") {
+    macs = product(output_shape);
+  } else if (op_type == "ResizeBilinear" || op_type == "ResizeBicubic") {
+    // Three MACs are counted per output element.
+    macs = 3 * product(output_shape);
+  }
+  return macs;
+}
+
 void OpStat::StatMetadata(const RunMetadata &meta_data) {
   if (meta_data.op_stats.empty()) {
     LOG(FATAL) << "Op metadata should not be empty";
@@ -112,6 +152,8 @@ void OpStat::StatMetadata(const RunMetadata &meta_data) {
       record->type = op_stat.type;
       record->args = op_stat.args;
       record->output_shape = op_stat.output_shape;
+      record->macs =
+          StatMACs(op_stat.type, op_stat.args.kernels, op_stat.output_shape[0]);
       record->order = order_idx;
       order_idx += 1;
     }
@@ -148,7 +190,7 @@ std::string OpStat::StatByMetric(const Metric metric,
   // generate string
   std::string title = "Sort by " + MetricToString(metric);
   const std::vector<std::string> header = {
-      "Node Type", "Start", "First", "Avg(ms)", "%", "cdf%",
+      "Op Type", "Start", "First", "Avg(ms)", "%", "cdf%", "GMACPS",
       "Stride", "Pad", "Filter Shape", "Output Shape", "Dilation", "name"
   };
   std::vector<std::vector<std::string>> data;
@@ -169,6 +211,9 @@ std::string OpStat::StatByMetric(const Metric metric,
         FloatToString(record.rel_end.sum() * 100.f / total_time_.sum(), 3));
     tuple.push_back(
         FloatToString(accumulate_time * 100.f / total_time_.sum(), 3));
+    tuple.push_back(FloatToString(
+        record.macs < 1e-6 ? record.macs :
+        (record.macs * 1e-3) / record.rel_end.avg(), 3));
     tuple.push_back(VectorToString<int>(record.args.strides));
     if (record.args.padding_type != -1) {
       tuple.push_back(PaddingTypeToString(record.args.padding_type));
@@ -184,40 +229,43 @@ std::string OpStat::StatByMetric(const Metric metric,
   return mace::string_util::StringFormatter::Table(title, header, data);
 }
 
-std::string OpStat::StatByNodeType() const {
+std::string OpStat::StatByOpType() const {
   if (records_.empty()) {
     return "";
   }
   const int64_t round = total_time_.round();
   int64_t total_time = 0;
   std::map<std::string, int64_t> type_time_map;
+  std::map<std::string, int64_t> type_macs_map;
   std::map<std::string, int64_t> type_count_map;
   std::map<std::string, int64_t> type_called_times_map;
-  std::set<std::string> node_types_set;
+  std::set<std::string> op_types_set;
   for (auto &record : records_) {
-    std::string node_type = record.second.type;
-    node_types_set.insert(node_type);
+    std::string op_type = record.second.type;
+    op_types_set.insert(op_type);
 
-    type_time_map[node_type] += record.second.rel_end.sum() / round;
+    type_time_map[op_type] += record.second.rel_end.sum() / round;
+    type_macs_map[op_type] += record.second.macs;
     total_time += record.second.rel_end.sum() / round;
-    type_count_map[node_type] += 1;
-    type_called_times_map[node_type] += record.second.called_times / round;
+    type_count_map[op_type] += 1;
+    type_called_times_map[op_type] += record.second.called_times / round;
   }
-  std::vector<std::string> node_types(node_types_set.begin(),
-                                      node_types_set.end());
-  std::sort(node_types.begin(), node_types.end(),
+  std::vector<std::string> op_types(op_types_set.begin(),
+                                    op_types_set.end());
+  std::sort(op_types.begin(), op_types.end(),
             [&](const std::string &lhs, const std::string &rhs) {
               return type_time_map[lhs] > type_time_map[rhs];
             });
 
-  std::string title = "Stat by node type";
+  std::string title = "Stat by Op Type";
   const std::vector<std::string> header = {
-      "Node Type", "Count", "Avg(ms)", "%", "cdf%", "Called times"
+      "Op Type", "Count", "Avg(ms)", "%", "cdf%", "MACs",
+      "GMACPS", "Called times"
   };
 
   float cdf = 0.0f;
   std::vector<std::vector<std::string>> data;
-  for (auto type : node_types) {
+  for (auto type : op_types) {
     const float avg_time = type_time_map[type] / 1000.0f;
     const float percentage = type_time_map[type] * 100.0f / total_time;
     cdf += percentage;
@@ -228,12 +276,43 @@ std::string OpStat::StatByNodeType() const {
     tuple.push_back(FloatToString(avg_time, 3));
     tuple.push_back(FloatToString(percentage, 3));
     tuple.push_back(FloatToString(cdf, 3));
+    tuple.push_back(IntToString(type_macs_map[type]));
+    tuple.push_back(FloatToString(
+        type_macs_map[type] < 1e-6 ? type_macs_map[type] :
+        (type_macs_map[type] * 1e-3) / type_time_map[type], 3));
     tuple.push_back(IntToString(type_called_times_map[type]));
     data.emplace_back(tuple);
   }
   return mace::string_util::StringFormatter::Table(title, header, data);
 }
 
+
+// Renders the one-row table of whole-model MACs statistics; returns an empty
+// string when no op records have been collected.
+std::string OpStat::StatByMACs() const {
+  if (records_.empty()) {
+    return "";
+  }
+  // Sum the MACs recorded for every op.
+  int64_t total_macs = 0;
+  for (const auto &entry : records_) {
+    total_macs += entry.second.macs;
+  }
+  const int64_t rounds = total_time_.round();
+  const double scaled_macs = total_macs * 1e-3;
+
+  std::string title = "Stat by MACs(Multiply-Accumulation)";
+  const std::vector<std::string> header = {
+      "total", "round", "first(G/s)", "avg(G/s)", "std"
+  };
+  std::vector<std::vector<std::string>> data;
+  data.push_back({IntToString(total_macs), IntToString(rounds),
+                  FloatToString(scaled_macs / total_time_.first(), 3),
+                  FloatToString(scaled_macs / total_time_.avg(), 3),
+                  FloatToString(total_time_.std_deviation(), 3)});
+  return mace::string_util::StringFormatter::Table(title, header, data);
+}
+
 std::string OpStat::Summary() const {
   std::stringstream stream;
   if (!records_.empty()) {
@@ -252,9 +331,11 @@ void OpStat::PrintStat() const {
     stream << StatByMetric(Metric::RUN_ORDER, 0) << std::endl;
     // top-10 op stat by time
     stream << StatByMetric(Metric::COMPUTATION_TIME, 10) << std::endl;
-    // op stat by node type
-    stream << StatByNodeType() << std::endl;
+    // op stat by op type
+    stream << StatByOpType() << std::endl;
   }
+  // print MACs statistics
+  stream << StatByMACs();
   // Print summary
   stream << Summary();
 
diff --git a/mace/benchmark/statistics.h b/mace/benchmark/statistics.h
index 52f963e5..f0cf2be6 100644
--- a/mace/benchmark/statistics.h
+++ b/mace/benchmark/statistics.h
@@ -19,6 +19,7 @@
 #include <cmath>
 #include <iomanip>
 #include <limits>
+#include <locale>
 #include <map>
 #include <sstream>
 #include <string>
@@ -33,11 +34,33 @@ class RunMetadata;
 
 namespace benchmark {
 
+// Count the multiply-accumulate (MAC) operations performed by one operator.
+int64_t StatMACs(const std::string &op_type,
+                 const std::vector<int64_t> &filter_shape,
+                 const std::vector<int64_t> &output_shape);
+
 template <typename IntType>
 std::string IntToString(const IntType v) {
   std::stringstream stream;
   stream << v;
-  return stream.str();
+  std::string src_str = stream.str();
+  size_t size = src_str.size();
+  size_t dst_size = size + ((size-1) / 3);
+  if (src_str[0] == '-') {
+    dst_size = size + ((size-2) / 3);
+  }
+  std::string result(dst_size, ',');
+  size_t dst_idx = dst_size - 1;
+  for (size_t src_idx = 0; src_idx < size; ++src_idx) {
+    if ((src_idx % 3) != 0 || src_idx == 0 || dst_idx == 0) {
+      result[dst_idx] = src_str[size - 1 - src_idx];
+    } else {
+      dst_idx -= 1;
+      result[dst_idx] = src_str[size - 1 - src_idx];
+    }
+    dst_idx -= 1;
+  }
+  return result;
 }
 
 template <typename FloatType>
@@ -127,7 +150,7 @@ enum Metric {
   COMPUTATION_TIME,
 };
 
-class OpStat{
+class OpStat {
  public:
   void StatMetadata(const RunMetadata &meta_data);
 
@@ -136,7 +159,8 @@ class OpStat{
  private:
   std::string StatByMetric(const Metric metric,
       const int top_limit) const;
-  std::string StatByNodeType() const;
+  std::string StatByOpType() const;
+  std::string StatByMACs() const;
   std::string Summary() const;
 
  private:
@@ -145,6 +169,7 @@ class OpStat{
     std::string type;
     std::vector<std::vector<int64_t>> output_shape;
     ConvPoolArgs args;
+    int64_t macs;
     int64_t order;
     TimeInfo<int64_t> start;
     TimeInfo<int64_t> rel_end;
diff --git a/mace/core/net.cc b/mace/core/net.cc
index 1732cfe1..7912a6d4 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -403,8 +403,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
       std::string type = op->debug_def().type();
 
       if (type.compare("Conv2D") == 0 ||
-          type.compare("FusedConv2D") == 0 ||
+          type.compare("Deconv2D") == 0 ||
           type.compare("DepthwiseConv2d") == 0 ||
+          type.compare("DepthwiseDeconv2d") == 0 ||
           type.compare("Pooling") == 0) {
         strides = op->GetRepeatedArgs<int>("strides");
         padding_type = op->GetOptionalArg<int>("padding", -1);
@@ -415,6 +416,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
         } else {
           kernels = op->Input(1)->shape();
         }
+      } else if (type.compare("MatMul") == 0) {
+        bool transpose_a = op->GetOptionalArg<bool>("transpose_a", false);
+        kernels = op->Input(0)->shape();
+        if (transpose_a) {
+          std::swap(kernels[kernels.size()-2], kernels[kernels.size()-1]);
+        }
+      } else if (type.compare("FullyConnected") == 0) {
+        kernels = op->Input(1)->shape();
       }
 
       std::vector<std::vector<int64_t>> output_shapes;
diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc
index 5da17509..57be33c2 100644
--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -28,7 +28,7 @@ namespace testing {
 
 static std::vector<Benchmark *> *all_benchmarks = nullptr;
 static int64_t bytes_processed;
-static int64_t macc_processed;
+static int64_t macs_processed = 0;
 static int64_t accum_time = 0;
 static int64_t start_time = 0;
 
@@ -62,8 +62,8 @@ void Benchmark::Run(const char *pattern) {
   // Internal perf regression tools depends on the output formatting,
   // please keep in consistent when modifying
   printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)",
-         "Iterations", "Input(MB/s)", "MACC(G/s)");
-  printf("%s\n", std::string(width + 44, '-').c_str());
+         "Iterations", "Input(MB/s)", "GMACPS");
+  printf("%s\n", std::string(width + 45, '-').c_str());
   for (auto b : *all_benchmarks) {
     if (!std::regex_match(b->name_, match, regex)) continue;
     int iters;
@@ -71,9 +71,9 @@ void Benchmark::Run(const char *pattern) {
     b->Run(&iters, &seconds);
     float mbps = (bytes_processed * 1e-6) / seconds;
     // MACCs or other computations
-    float gmaccs = (macc_processed * 1e-9) / seconds;
+    float gmacs = (macs_processed * 1e-9) / seconds;
     printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, b->name_.c_str(),
-           seconds * 1e9 / iters, iters, mbps, gmaccs);
+           seconds * 1e9 / iters, iters, mbps, gmacs);
   }
 }
 
@@ -89,7 +89,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) {
   int64_t iters = kMinIters;
   while (true) {
     bytes_processed = -1;
-    macc_processed = -1;
+    macs_processed = 0;
     RestartTiming();
     (*benchmark_func_)(iters);
     StopTiming();
@@ -108,7 +108,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) {
 }
 
 void BytesProcessed(int64_t n) { bytes_processed = n; }
-void MaccProcessed(int64_t n) { macc_processed = n; }
+void MacsProcessed(int64_t n) { macs_processed = n; }
 void RestartTiming() {
   accum_time = 0;
   start_time = NowMicros();
diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h
index b6c070c7..2eb91e40 100644
--- a/mace/core/testing/test_benchmark.h
+++ b/mace/core/testing/test_benchmark.h
@@ -42,7 +42,7 @@ class Benchmark {
 };
 
 void BytesProcessed(int64_t);
-void MaccProcessed(int64_t);
+void MacsProcessed(int64_t);
 void RestartTiming();
 void StartTiming();
 void StopTiming();
diff --git a/mace/ops/BUILD b/mace/ops/BUILD
index 1d8c821d..f6e01a74 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -230,6 +230,7 @@ cc_test(
     linkstatic = 1,
     deps = [
         "test",
+        "//mace/benchmark:statistics",
         "//mace/core:test_benchmark_main",
         "//third_party/eigen3",
     ],
diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc
index 76447e9b..6faf62ce 100644
--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -62,7 +62,6 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
   static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(        \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                          \
@@ -119,7 +118,6 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
   static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(        \
       int iters) {                                                            \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                           \
@@ -179,7 +177,6 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
   static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(        \
       int iters) {                                                            \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                           \
@@ -235,7 +232,6 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
   static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(        \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                          \
@@ -292,7 +288,6 @@ void SigmoidBenchmark(
   static void MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
       int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;      \
-    mace::testing::MaccProcessed(tot);                                    \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
     SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                    \
   }                                                                       \
diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc
index f5e11740..b9751557 100644
--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -59,7 +59,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
       MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(      \
           int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C);                   \
   }                                                                           \
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index d3467e76..a6afcb07 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -75,7 +75,7 @@ void BatchNorm(
   static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
       int iters) {                                                          \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;        \
-    mace::testing::MaccProcessed(tot);                                      \
+    mace::testing::MacsProcessed(tot);                                      \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
     BatchNorm<DEVICE, TYPE>(iters, N, C, H, W);                             \
   }                                                                         \
diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc
index 9664a917..64264936 100644
--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -58,7 +58,6 @@ void BMBatchToSpace(
       MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\
           int iters) {                                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
-    mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG);                      \
   }                                                                            \
diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc
index 9026ffb2..f0604d56 100644
--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -65,7 +65,6 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
   static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
       int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;      \
-    mace::testing::MaccProcessed(tot);                                    \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
     BiasAdd<DEVICE, TYPE>(iters, N, C, H, W);                             \
   }                                                                       \
diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc
index f5f1df41..4ba0f64c 100644
--- a/mace/ops/buffer_to_image_benchmark.cc
+++ b/mace/ops/buffer_to_image_benchmark.cc
@@ -68,7 +68,6 @@ void FilterBufferToImage(int iters,
   static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
-    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
     FilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W);            \
   }                                                                  \
diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc
index db5f8494..8ea6d139 100644
--- a/mace/ops/channel_shuffle_benchmark.cc
+++ b/mace/ops/channel_shuffle_benchmark.cc
@@ -61,7 +61,6 @@ void ChannelShuffle(
       MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
           int iters) {                                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
-    mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G);                        \
   }                                                                            \
diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc
index a43fc308..eaff9b44 100644
--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -49,7 +49,6 @@ void ConcatHelper(int iters, int concat_dim, int dim0, int dim1) {
     net.Run();
   }
   const int64_t tot = static_cast<int64_t>(iters) * dim0 * dim1 * 2;
-  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
@@ -104,7 +103,6 @@ void OpenCLConcatHelper(int iters,
   const int64_t tot =
       static_cast<int64_t>(iters) *
       (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
-  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 91efff79..a0e78003 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
@@ -154,9 +155,10 @@ void Conv2d<CPU, uint8_t>(int iters,
         (H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1;        \
     int64_t ow =                                                              \
         (W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1;        \
-    const int64_t macc =                                                      \
-        static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1);   \
-    mace::testing::MaccProcessed(macc);                                       \
+    const int64_t macs =                                                      \
+        static_cast<int64_t>(iters) * mace::benchmark::StatMACs(              \
+            "Conv2D", {OC, C, KH, KW}, {N, oh, ow, OC});                      \
+    mace::testing::MacsProcessed(macs);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION,         \
                          mace::Padding::P, OC);                               \
diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc
index aad6f93d..5133a28a 100644
--- a/mace/ops/crop_benchmark.cc
+++ b/mace/ops/crop_benchmark.cc
@@ -44,7 +44,6 @@ void CropHelper(int iters, int crop_axis, int dim1, int offset) {
     net.RunOp(D);
   }
   const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * dim1;
-  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
@@ -96,7 +95,6 @@ void OpenCLCropHelper(int iters,
   const int64_t tot =
       static_cast<int64_t>(iters) *
       (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
-  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc
index 81be17c0..9a2c405d 100644
--- a/mace/ops/deconv_2d_benchmark.cc
+++ b/mace/ops/deconv_2d_benchmark.cc
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
@@ -90,9 +91,10 @@ static void Deconv2d(int iters,
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
     int64_t oh = OH;                                                          \
     int64_t ow = OW;                                                          \
-    const int64_t macc =                                                      \
-        static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1);   \
-    mace::testing::MaccProcessed(macc);                                       \
+    const int64_t macs =                                                      \
+        static_cast<int64_t>(iters) * mace::benchmark::StatMACs(              \
+            "Deconv2D", {OC, C, KH, KW}, {N, OH, OW, OC});                    \
+    mace::testing::MacsProcessed(macs);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     Deconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, OH, OW,         \
                          mace::Padding::P, OC);                               \
diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc
index c9c6dd40..1283e432 100644
--- a/mace/ops/depth_to_space_benchmark.cc
+++ b/mace/ops/depth_to_space_benchmark.cc
@@ -62,7 +62,6 @@ void DepthToSpace(
       MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
           int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
   }                                                                           \
diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc
index 4d44a9bc..c5aee849 100644
--- a/mace/ops/depthwise_conv2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
@@ -115,9 +116,10 @@ void DepthwiseConv2d(int iters,
         (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1;         \
     int64_t ow =                                                               \
         (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1;         \
-    const int64_t macc =                                                       \
-        static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1);     \
-    mace::testing::MaccProcessed(macc);                                        \
+    const int64_t macs =                                                       \
+        static_cast<int64_t>(iters) * mace::benchmark::StatMACs(               \
+            "DepthwiseConv2d", {M, C, KH, KW}, {N, oh, ow, C});                \
+    mace::testing::MacsProcessed(macs);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,           \
                                   mace::Padding::P, M);                        \
diff --git a/mace/ops/depthwise_deconv2d_benchmark.cc b/mace/ops/depthwise_deconv2d_benchmark.cc
index 081e10d2..a130ca1d 100644
--- a/mace/ops/depthwise_deconv2d_benchmark.cc
+++ b/mace/ops/depthwise_deconv2d_benchmark.cc
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 
+#include "mace/benchmark/statistics.h"
 #include "mace/core/operator.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
@@ -81,11 +82,12 @@ static void DepthwiseDeconv2d(int iters,
         ##_##TYPE##_##DEVICE(                                                 \
           int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    const int64_t macc =                                                      \
-        static_cast<int64_t>(iters) * N * H * W * KH * KW * C;   \
-    mace::testing::MaccProcessed(macc);                                       \
+    const int64_t macs =                                                      \
+        static_cast<int64_t>(iters) * mace::benchmark::StatMACs(              \
+            "DepthwiseDeconv2d", {1, C, KH, KW}, {N, H, W, C});               \
+    mace::testing::MacsProcessed(macs);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
-    DepthwiseDeconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, S, P);        \
+    DepthwiseDeconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, S, P);         \
   }                                                                           \
   MACE_BENCHMARK(                                                             \
     MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\
diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc
index 95808bc3..b75149bd 100644
--- a/mace/ops/eltwise_benchmark.cc
+++ b/mace/ops/eltwise_benchmark.cc
@@ -66,7 +66,6 @@ void EltwiseBenchmark(
       MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
           int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     EltwiseBenchmark<DEVICE, TYPE>(                                           \
         iters, static_cast<ops::EltwiseType>(ELT_TYPE), N, H, W, C);      \
diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc
index bb27c97d..bb6dcd80 100644
--- a/mace/ops/fully_connected_benchmark.cc
+++ b/mace/ops/fully_connected_benchmark.cc
@@ -14,6 +14,7 @@
 
 #include <string>
 
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -104,11 +105,12 @@ void FCBenchmark<CPU, uint8_t>(
 #define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE)                     \
   static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
       int iters) {                                                         \
-    const int64_t macc =                                                   \
-        static_cast<int64_t>(iters) * N * C * H * W * OC + OC;             \
+    const int64_t macs =                                                   \
+        static_cast<int64_t>(iters) * mace::benchmark::StatMACs(           \
+            "FullyConnected", {OC, H, W, C}, {N, 1, 1, OC});               \
     const int64_t tot =                                                    \
         static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC;           \
-    mace::testing::MaccProcessed(macc);                                    \
+    mace::testing::MacsProcessed(macs);                                    \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                    \
     FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC);                      \
   }                                                                        \
diff --git a/mace/ops/gather_benchmark.cc b/mace/ops/gather_benchmark.cc
index 5e52875c..7fe4a0fb 100644
--- a/mace/ops/gather_benchmark.cc
+++ b/mace/ops/gather_benchmark.cc
@@ -66,7 +66,6 @@ void GatherBenchmark(int iters,
       MACE_BM_GATHER##_##N##_##IND##_##VOC##_##EMBED##_##TYPE##_##DEVICE( \
           int iters) {                                                    \
     const int64_t tot = static_cast<int64_t>(iters) * N * IND * EMBED;    \
-    mace::testing::MaccProcessed(0);                                      \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
     GatherBenchmark<DEVICE, TYPE>(iters, N, IND, VOC, EMBED);             \
   }                                                                       \
diff --git a/mace/ops/local_response_norm_benchmark.cc b/mace/ops/local_response_norm_benchmark.cc
index b917c495..61207af0 100644
--- a/mace/ops/local_response_norm_benchmark.cc
+++ b/mace/ops/local_response_norm_benchmark.cc
@@ -59,7 +59,6 @@ static void LocalResponseNorm(
       MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(   \
           int iters) {                                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
-    mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     LocalResponseNorm<DEVICE, TYPE>(iters, N, C, H, W);                        \
   }                                                                            \
diff --git a/mace/ops/lstmcell_benchmark.cc b/mace/ops/lstmcell_benchmark.cc
index 6568025a..a3b96094 100644
--- a/mace/ops/lstmcell_benchmark.cc
+++ b/mace/ops/lstmcell_benchmark.cc
@@ -79,11 +79,11 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) {
   static void                                                                  \
       MACE_BM_LSTMCELL_##N##_##INPUT_SIZE##_##HIDDEN_UNITS##_##TYPE##_##DEVICE(\
         int iters) {                                                           \
-    const int64_t macc =                                                       \
+    const int64_t macs =                                                       \
         static_cast<int64_t>(                                                  \
             iters) * N * (INPUT_SIZE + HIDDEN_UNITS) * 4 * HIDDEN_UNITS;       \
     const int64_t tot = static_cast<int64_t>(iters) * N * INPUT_SIZE;          \
-    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::MacsProcessed(macs);                                        \
     mace::testing::BytesProcessed(tot * (sizeof(TYPE)));                       \
     LSTMCell<DEVICE, TYPE>(iters, N, INPUT_SIZE, HIDDEN_UNITS);                \
   }                                                                            \
diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc
index f118e63f..1996587a 100644
--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -19,6 +19,7 @@
 #include <vector>
 
 #include "public/gemmlowp.h"
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/gemm.h"
 #include "mace/ops/sgemm.h"
@@ -223,9 +224,10 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
 
 #define MACE_BM_MATMUL_FUNC(M, K, N, FUNC, TYPE)                   \
   static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \
-    const int64_t macc = static_cast<int64_t>(iters) * M * K * N;  \
+    const int64_t macs = static_cast<int64_t>(iters) *             \
+        mace::benchmark::StatMACs("MatMul", {K}, {M, N});          \
     const int64_t tot = static_cast<int64_t>(iters) * (M + N) * K; \
-    mace::testing::MaccProcessed(macc);                            \
+    mace::testing::MacsProcessed(macs);                            \
     mace::testing::BytesProcessed(tot * sizeof(TYPE));             \
     MatmulBenchmark_##FUNC(iters, M, K, N);                        \
   }                                                                \
@@ -377,9 +379,10 @@ void MatMulTransposeBenchmark(
 #define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                         \
   static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(        \
       int iters) {                                                             \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W;          \
+    const int64_t macs = static_cast<int64_t>(iters) *                         \
+        mace::benchmark::StatMACs("MatMul", {C}, {N, H, W});                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W);     \
-    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::MacsProcessed(macs);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                          \
   }                                                                            \
@@ -392,9 +395,10 @@ void MatMulTransposeBenchmark(
 #define MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE)               \
   static void MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(    \
       int iters) {                                                             \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W;          \
+    const int64_t macs = static_cast<int64_t>(iters) *                         \
+        mace::benchmark::StatMACs("MatMul", {C}, {N, H, W});                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W);     \
-    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::MacsProcessed(macs);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     MatMulTransposeBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                 \
   }                                                                            \
diff --git a/mace/ops/memory_benchmark.cc b/mace/ops/memory_benchmark.cc
index e3bb30a8..73f3bdeb 100644
--- a/mace/ops/memory_benchmark.cc
+++ b/mace/ops/memory_benchmark.cc
@@ -94,7 +94,6 @@ void MemoryAccessBenchmark_NHCW(
   static void MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER( \
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
-    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot * sizeof(float));              \
     MemoryAccessBenchmark_##ORDER(iters, N, H, W, C);                \
   }                                                                  \
diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc
index fb7f4e14..0125b4f5 100644
--- a/mace/ops/pad_benchmark.cc
+++ b/mace/ops/pad_benchmark.cc
@@ -57,7 +57,6 @@ void Pad(int iters, int batch, int height,
   static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD);                               \
   }                                                                          \
diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc
index c48cc877..880c0cad 100644
--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -81,7 +81,6 @@ void Pooling(int iters,
         ##TYPE##_##DEVICE(                                                     \
           int iters) {                                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
-    mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     Pooling<DEVICE, TYPE>(iters, N, C, H, W, KE, STRIDE, Padding::PA,          \
                     PoolingType::PO);                                          \
diff --git a/mace/ops/quantize_benchmark.cc b/mace/ops/quantize_benchmark.cc
index 62a534b7..0c1493b8 100644
--- a/mace/ops/quantize_benchmark.cc
+++ b/mace/ops/quantize_benchmark.cc
@@ -82,7 +82,6 @@ void Dequantize(int iters, int count) {
     MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE(              \
       int iters) {                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N;   \
-    mace::testing::MaccProcessed(tot);                     \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));    \
     Quantize<DEVICE, TYPE>(iters, N);                      \
   }                                                        \
@@ -97,7 +96,6 @@ void Dequantize(int iters, int count) {
     MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE(            \
       int iters) {                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N;   \
-    mace::testing::MaccProcessed(tot);                     \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));    \
     Dequantize<DEVICE, TYPE>(iters, N);                    \
   }                                                        \
diff --git a/mace/ops/reduce_benchmark.cc b/mace/ops/reduce_benchmark.cc
index ec8807b0..663c3b45 100644
--- a/mace/ops/reduce_benchmark.cc
+++ b/mace/ops/reduce_benchmark.cc
@@ -60,7 +60,6 @@ void Reduce(int iters, int batch, int channels,
     MACE_BM_REDUCE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
     Reduce<DEVICE, TYPE>(iters, N, C, H, W);        \
   }                                                                  \
diff --git a/mace/ops/resize_bicubic_benchmark.cc b/mace/ops/resize_bicubic_benchmark.cc
index 5ababeba..85e073fd 100644
--- a/mace/ops/resize_bicubic_benchmark.cc
+++ b/mace/ops/resize_bicubic_benchmark.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include <string>
+
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -69,9 +71,10 @@ void ResizeBicubicBenchmark(int iters,
       MACE_BM_RESIZE_BICUBIC_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
         ##DEVICE(                                                             \
           int iters) {                                                        \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3;   \
+    const int64_t macs = static_cast<int64_t>(iters) *                        \
+        mace::benchmark::StatMACs("ResizeBicubic", {}, {N, H1, W1, C});       \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0;        \
-    mace::testing::MaccProcessed(macc);                                       \
+    mace::testing::MacsProcessed(macs);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     ResizeBicubicBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1);        \
   }                                                                           \
diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc
index bace4f10..ddc0f508 100644
--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include <string>
+
+#include "mace/benchmark/statistics.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -75,9 +77,10 @@ void ResizeBilinearBenchmark(int iters,
       MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
         ##DEVICE(                                                              \
           int iters) {                                                         \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3;    \
+    const int64_t macs = static_cast<int64_t>(iters) *                         \
+        mace::benchmark::StatMACs("ResizeBilinear", {}, {N, H1, W1, C});       \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0;         \
-    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::MacsProcessed(macs);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1);        \
   }                                                                            \
diff --git a/mace/ops/reverse_benchmark.cc b/mace/ops/reverse_benchmark.cc
index 9630f696..9b7a915a 100644
--- a/mace/ops/reverse_benchmark.cc
+++ b/mace/ops/reverse_benchmark.cc
@@ -51,10 +51,7 @@ void Reverse(int iters, int batch, int channels, int height, int width) {
 #define MACE_BM_REVERSE_MACRO(N, C, H, W, TYPE, DEVICE)                   \
   static void MACE_BM_REVERSE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
       int iters) {                                                        \
-    const int64_t macc =                                                  \
-        static_cast<int64_t>(iters) * N * C * H * W;                      \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;      \
-    mace::testing::MaccProcessed(macc);                                   \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
     Reverse<DEVICE, TYPE>(iters, N, C, H, W);                             \
   }                                                                       \
diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc
index 25095da5..819544b2 100644
--- a/mace/ops/softmax_benchmark.cc
+++ b/mace/ops/softmax_benchmark.cc
@@ -98,7 +98,6 @@ void SoftmaxBenchmark<CPU, uint8_t>(
   static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
       int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;      \
-    mace::testing::MaccProcessed(tot);                                    \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                   \
     SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                    \
   }                                                                       \
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index cacadfcd..168461de 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -64,7 +64,6 @@ void BMSpaceToBatch(
     MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\
         int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
-    mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE);                    \
   }                                                                            \
diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc
index 3311d618..6bd7755e 100644
--- a/mace/ops/space_to_depth_benchmark.cc
+++ b/mace/ops/space_to_depth_benchmark.cc
@@ -62,7 +62,6 @@ void SpaceToDepth(
       MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
           int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
   }                                                                           \
diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc
index b21da8f5..020c3214 100644
--- a/mace/ops/split_benchmark.cc
+++ b/mace/ops/split_benchmark.cc
@@ -65,7 +65,6 @@ void BMSplitHelper(int iters,
       MACE_BM_SPLIT_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE(        \
           int iters) {                                                       \
         const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;     \
-        mace::testing::MaccProcessed(tot);                                   \
         mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                  \
         BMSplitHelper<DEVICE, TYPE>(iters, {N, H, W, C}, NO);                \
       }                                                                      \
diff --git a/mace/ops/sqrdiff_mean_benchmark.cc b/mace/ops/sqrdiff_mean_benchmark.cc
index 353d8e7a..1d2a7aa3 100644
--- a/mace/ops/sqrdiff_mean_benchmark.cc
+++ b/mace/ops/sqrdiff_mean_benchmark.cc
@@ -63,7 +63,6 @@ void SqrDiffMean(int iters, int batch, int channels,
     MACE_BM_SQRDIFF_MEAN_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
     SqrDiffMean<DEVICE, TYPE>(iters, N, C, H, W);        \
   }                                                                  \
diff --git a/mace/ops/transpose_benchmark.cc b/mace/ops/transpose_benchmark.cc
index f584239a..372f2f9d 100644
--- a/mace/ops/transpose_benchmark.cc
+++ b/mace/ops/transpose_benchmark.cc
@@ -58,7 +58,6 @@ void TransposeBenchmark(int iters,
   static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE(     \
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * H * W;         \
-    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
     TransposeBenchmark<DEVICE, TYPE>(iters, {H, W}, {1, 0});         \
   }                                                                  \
@@ -72,7 +71,6 @@ void TransposeBenchmark(int iters,
     MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
       DEVICE(int iters) {                                                     \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3});  \
   }                                                                           \
diff --git a/mace/test/BUILD b/mace/test/BUILD
index 63faecfe..283dd486 100644
--- a/mace/test/BUILD
+++ b/mace/test/BUILD
@@ -7,10 +7,10 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//mace:mace.bzl",
     "if_android",
-    "if_hexagon_enabled",
-    "if_not_hexagon_enabled",
-    "if_openmp_enabled",
     "if_neon_enabled",
+    "if_openmp_enabled",
+    "if_android_armv7",
+    "if_hexagon_enabled",
     "if_opencl_enabled",
     "if_quantize_enabled",
 )
@@ -32,16 +32,19 @@ cc_test(
         "-Wextra",
         "-Wno-missing-field-initializers",
     ] + if_openmp_enabled([
-        "-fopenmp",
-        "-DMACE_ENABLE_OPENMP",
+        "-fopenmp"
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+    ]) + if_android_armv7([
+        "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
     ]) + if_quantize_enabled([
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
@@ -62,16 +65,19 @@ cc_test(
         "-Wextra",
         "-Wno-missing-field-initializers",
     ] + if_openmp_enabled([
-        "-fopenmp",
-        "-DMACE_ENABLE_OPENMP",
+        "-fopenmp"
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+    ]) + if_android_armv7([
+        "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
     ]) + if_quantize_enabled([
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
@@ -92,16 +98,19 @@ cc_test(
         "-Wextra",
         "-Wno-missing-field-initializers",
     ] + if_openmp_enabled([
-        "-fopenmp",
-        "-DMACE_ENABLE_OPENMP",
+        "-fopenmp"
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+    ]) + if_android_armv7([
+        "-mfloat-abi=softfp",
     ]) + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
     ]) + if_quantize_enabled([
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py
index 71f3cc14..96ad13a4 100644
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -50,7 +50,7 @@ def ops_benchmark_stdout_processor(stdout, dev, abi):
         if len(parts) == 5 and parts[0].startswith("BM_"):
             metrics["%s.time_ms" % parts[0]] = str(float(parts[1]) / 1e6)
             metrics["%s.input_mb_per_sec" % parts[0]] = parts[3]
-            metrics["%s.gmacc_per_sec" % parts[0]] = parts[4]
+            metrics["%s.gmac_per_sec" % parts[0]] = parts[4]
 
     # platform = dev[YAMLKeyword.target_socs]
     # model = dev[YAMLKeyword.device_name]
-- 
GitLab