提交 5f021151 编写于 作者: L liuqi

Add MACs statistics for model benchmark tool and related docs

上级 b31a29a3
......@@ -9,6 +9,7 @@ stages:
- api_test
- python_tools_tests
- model_tests
- quantization_tests
- build_android_demo
- ops_benchmark
- extra_tests
......@@ -62,6 +63,14 @@ api_test:
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
- >
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
ops_benchmark:
stage: ops_benchmark
......@@ -103,7 +112,7 @@ ndk_versions_compatible_tests:
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b;
for ndk in android-ndk-r15c android-ndk-r16 android-ndk-r17b;
do
new_ndk_path=${prefix_path}${ndk};
if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then
......@@ -111,8 +120,12 @@ ndk_versions_compatible_tests:
export PATH=$ANDROID_NDK_HOME:$PATH;
echo "ndk path: $ANDROID_NDK_HOME";
if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
fi
done
- export ANDROID_NDK_HOME=$DEFAULT_NDK_PATH
......@@ -131,9 +144,9 @@ python_tools_tests:
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a,arm64 --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,armhf --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,armhf --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
model_tests:
stage: model_tests
......@@ -142,23 +155,39 @@ model_tests:
- rm -rf mace-models
- rm -rf generic-mobile-devices
- GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
- CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml
- >
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
for CONF_FILE in mace-models/mobilenet-v1/mobilenet-v1.yml mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml;
do
python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
done
python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
- CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
- >
python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
- rm -rf mace-models
quantization_tests:
stage: quantization_tests
script:
- pwd
- rm -rf mace-models
- GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
- CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml
- >
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
- rm -rf mace-models
build_android_demo:
......
......@@ -27,6 +27,7 @@ The main documentation is organized into the following sections:
user_guide/basic_usage
user_guide/advanced_usage
user_guide/benchmark
user_guide/op_lists
user_guide/quantization_usage
......
......@@ -379,6 +379,8 @@ Useful Commands
* **benchmark and profile model**
the detailed information is in :doc:`benchmark`.
.. code:: sh
# Benchmark model, get detailed statistics of each Op.
......
......@@ -227,7 +227,7 @@ to run and validate your model.
* **benchmark**
benchmark and profile the model.
benchmark and profile the model. the details are in :doc:`benchmark`.
.. code:: sh
......
Benchmark usage
===============
This part contains the usage of MACE benchmark tools.
Overview
--------
As mentioned in the previous part, there are two kinds of benchmark tools,
one for operator and the other for model.
Operator Benchmark
------------------
Operator Benchmark is used to test and optimize the performance of a specific operator.
=====
Usage
=====
.. code:: bash
python tools/bazel_adb_run.py --target="//mace/ops:ops_benchmark" --run_target=True --args="--filter=.*BM_CONV.*"
======
Output
======
.. code:: bash
Benchmark Time(ns) Iterations Input(MB/s) GMACPS
------------------------------------------------------------------------------------------------------
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_CPU 1759129 479 114.09 29.21
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_GPU 4031301 226 49.79 12.75
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_half_GPU 3996357 266 25.11 12.86
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_uint8_t_CPU 914994 1093 54.84 56.15
===========
Explanation
===========
.. list-table::
:header-rows: 1
* - Options
- Usage
* - Benchmark
- Benchmark unit name.
* - Time
- Time of one round.
* - Iterations
- the number of iterations to run, which is between 10 and 1,000,000,000. The value is chosen automatically so that the total run time does not exceed 1s.
* - Input
- The bandwidth of dealing with input. the unit is MB/s.
* - GMACPS
- The speed of running MACs (multiply-accumulate operations). The unit is G/s.
Model Benchmark
---------------
Model Benchmark is used to test and optimize the performance of your model.
This tool could record the running time of the model and the detailed running information of each operator of your model.
=====
Usage
=====
.. code:: bash
python tools/converter.py benchmark --config=/path/to/your/model_deployment.yml
======
Output
======
.. code:: bash
I benchmark_model.cc:158 ---------------------------------------------------------------------
I benchmark_model.cc:158 Warm Up
I benchmark_model.cc:158 ----------------------------------------------------------------------
I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I benchmark_model.cc:158 ----------------------------------------------------------------------
I benchmark_model.cc:158 | 1 | 51.481 | 51.481 | 51.481 | 51.481 | 51.481 | 0.000 |
I benchmark_model.cc:158 ----------------------------------------------------------------------
I benchmark_model.cc:158
I benchmark_model.cc:158 ------------------------------------------------------------------------
I benchmark_model.cc:158 Run without statistics
I benchmark_model.cc:158 -------------------------------------------------------------------------
I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I benchmark_model.cc:158 -------------------------------------------------------------------------
I benchmark_model.cc:158 | 100 | 30.272 | 31.390 | 29.938 | 45.966 | 30.913 | 1850.983 |
I benchmark_model.cc:158 -------------------------------------------------------------------------
I benchmark_model.cc:158
I benchmark_model.cc:158 -----------------------------------------------------------------------
I benchmark_model.cc:158 Run with statistics
I benchmark_model.cc:158 ------------------------------------------------------------------------
I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I benchmark_model.cc:158 ------------------------------------------------------------------------
I benchmark_model.cc:158 | 100 | 32.358 | 33.327 | 32.293 | 33.607 | 33.002 | 310.435 |
I benchmark_model.cc:158 ------------------------------------------------------------------------
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 Sort by Run Order
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Op Type | Start | First | Avg(ms) | % | cdf% | GMACPS | Stride | Pad | Filter Shape | Output Shape | Dilation | name |
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Transpose | 0.000 | 0.102 | 0.100 | 0.315 | 0.315 | 0.000 | | | | [1,3,224,224] | | input |
I statistics.cc:343 | Conv2D | 0.107 | 1.541 | 1.570 | 4.943 | 5.258 | 6.904 | [2,2] | SAME | [32,3,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_0/Relu6 |
I statistics.cc:343 | DepthwiseConv2d | 1.724 | 0.936 | 0.944 | 2.972 | 8.230 | 3.827 | [1,1] | SAME | [1,32,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6 |
I statistics.cc:343 | Softmax | 32.835 | 0.039 | 0.042 | 0.131 | 99.996 | 0.000 | | | | [1,1001] | | MobilenetV1/Predictions/Softmax |
I statistics.cc:343 | Identity | 32.880 | 0.001 | 0.001 | 0.004 | 100.000 | 0.000 | | | | [1,1001] | | mace_output_node_MobilenetV1/Predictions/Reshape_1 |
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 Sort by Computation Time
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Op Type | Start | First | Avg(ms) | % | cdf% | GMACPS | Stride | Pad | Filter Shape | Output Shape | Dilation | name |
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Conv2D | 30.093 | 2.102 | 2.198 | 6.922 | 6.922 | 23.372 | [1,1] | SAME | [1024,1024,1,1] | [1,1024,7,7] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 7.823 | 2.115 | 2.164 | 6.813 | 13.735 | 23.747 | [1,1] | SAME | [128,128,1,1] | [1,128,56,56] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 15.859 | 2.119 | 2.109 | 6.642 | 20.377 | 24.358 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 23.619 | 2.087 | 2.096 | 6.599 | 26.976 | 24.517 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 26.204 | 2.081 | 2.093 | 6.590 | 33.567 | 24.549 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 21.038 | 2.036 | 2.091 | 6.585 | 40.152 | 24.569 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 18.465 | 2.034 | 2.082 | 6.554 | 46.706 | 24.684 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 2.709 | 1.984 | 2.058 | 6.482 | 53.188 | 12.480 | [1,1] | SAME | [64,32,1,1] | [1,64,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 12.220 | 1.788 | 1.901 | 5.986 | 59.174 | 27.027 | [1,1] | SAME | [256,256,1,1] | [1,256,28,28] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 0.107 | 1.541 | 1.570 | 4.943 | 64.117 | 6.904 | [2,2] | SAME | [32,3,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_0/Relu6 |
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343 Stat by Op Type
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343 | Op Type | Count | Avg(ms) | % | cdf% | MACs | GMACPS | Called times |
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343 | Conv2D | 15 | 24.978 | 78.693 | 78.693 | 551,355,392 | 22.074 | 15 |
I statistics.cc:343 | DepthwiseConv2d | 13 | 6.543 | 20.614 | 99.307 | 17,385,984 | 2.657 | 13 |
I statistics.cc:343 | Transpose | 1 | 0.100 | 0.315 | 99.622 | 0 | 0.000 | 1 |
I statistics.cc:343 | Pooling | 1 | 0.072 | 0.227 | 99.849 | 0 | 0.000 | 1 |
I statistics.cc:343 | Softmax | 1 | 0.041 | 0.129 | 99.978 | 0 | 0.000 | 1 |
I statistics.cc:343 | Squeeze | 1 | 0.006 | 0.019 | 99.997 | 0 | 0.000 | 1 |
I statistics.cc:343 | Identity | 1 | 0.001 | 0.003 | 100.000 | 0 | 0.000 | 1 |
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 Stat by MACs(Multiply-Accumulation)
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 | total | round | first(G/s) | avg(G/s) | std |
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 | 568,741,376 | 100 | 18.330 | 17.909 | 301.326 |
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 ------------------------------------------------------------------------
I statistics.cc:343 Summary of Ops' Stat
I statistics.cc:343 ------------------------------------------------------------------------
I statistics.cc:343 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I statistics.cc:343 ------------------------------------------------------------------------
I statistics.cc:343 | 100 | 31.028 | 32.093 | 31.028 | 32.346 | 31.758 | 301.326 |
I statistics.cc:343 ------------------------------------------------------------------------
===========
Explanation
===========
There are 8 sections of the output information.
1. **Warm Up**
This section lists the time information of warm-up run.
The detailed explanation is listed below.
.. list-table::
:header-rows: 1
* - Key
- Explanation
* - round
- the number of rounds that have been run.
* - first
- the run time of first round. unit is millisecond.
* - curr
- the run time of last round. unit is millisecond.
* - min
- the minimal run time of all rounds. unit is millisecond.
* - max
- the maximal run time of all rounds. unit is millisecond.
* - avg
- the average run time of all rounds. unit is millisecond.
* - std
- the standard deviation of all rounds.
2. **Run without statistics**
This section lists the run time information without statistics code.
the detailed explanation is the same as the section of Warm Up.
3. **Run with statistics**
This section lists the run time information with statistics code,
the time maybe longer compared with the second section.
the detailed explanation is the same as the section of Warm Up.
4. **Sort by Run Order**
This section lists the detailed run information of every operator in your model.
The operators are listed in run order; every line is one operator of your model.
The detailed explanation is listed below.
.. list-table::
:header-rows: 1
* - Key
- Explanation
* - Op Type
- the type of operator.
* - Start
- the start time of the operator. unit is millisecond.
* - First
- the run time of first round. unit is millisecond.
* - Avg
- the average run time of all rounds. unit is millisecond.
* - %
- the percentage of total running time.
* - cdf%
- the cumulative percentage of running time.
* - GMACPS
- The number of run MACs(multiply-accumulation) per second. the unit is G/s.
* - Stride
- the stride parameter of the operator if exist.
* - Pad
- the pad parameter of the operator if exist.
* - Filter Shape
- the filter shape of the operator if exist.
* - Output Shape
- the output shape of the operator.
* - Dilation
- the dilation parameter of the operator if exist.
* - Name
- the name of the operator.
5. **Sort by Computation time**
This section lists the top-10 most time-consuming operators.
The operators are listed by computation time;
the detailed explanation is the same as previous section.
6. **Stat by Op Type**
This section stats the run information about operators based on operator type.
.. list-table::
:header-rows: 1
* - Op Type
- the type of operator.
* - Count
- the number of operators with the type.
* - Avg
- the average run time of the operator. unit is millisecond.
* - %
- the percentage of total running time.
* - cdf%
- the cumulative percentage of running time.
* - MACs
- The number of MACs(multiply-accumulation).
* - GMACPS
- The number of MACs(multiply-accumulation) runs per second. the unit is G/s.
* - Called times
- the number of called times in all rounds.
7. **Stat by MACs**
This section stats the MACs information of your model.
.. list-table::
:header-rows: 1
* - total
- the number of MACs of your model.
* - round
- the number of rounds that have been run.
* - First
- the GMACPS of the first round. The unit is G/s.
* - Avg
- the average GMACPS of all rounds. The unit is G/s.
* - std
- the standard deviation of all rounds.
8. **Summary of Ops' Stat**
This section lists the total run time obtained by summing every operator's run time,
which may be shorter than the model's measured run time with statistics.
the detailed explanation is the same as the section of Warm Up.
......@@ -15,6 +15,7 @@ cc_library(
srcs = ["statistics.cc"],
hdrs = ["statistics.h"],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
visibility = ["//visibility:public"],
deps = [
"//mace/utils",
],
......
......@@ -48,23 +48,6 @@ std::vector<std::string> Split(const std::string &str, char delims) {
return result;
}
// Split `str` on the `delims` character and parse each segment as an
// integer, appending the parsed values to `result`.  Always returns true;
// a segment that does not start with digits is parsed as 0 by atoi.
// NOTE(review): atoi returns `int`, so values outside the int range are
// truncated even though the destination is int64_t — confirm callers never
// pass 64-bit dimensions.
bool SplitAndParseToInts(const std::string &str,
                         char delims,
                         std::vector<int64_t> *result) {
  std::string tmp = str;
  while (!tmp.empty()) {
    // Parse the leading number of the remaining string; atoi stops at the
    // first non-numeric character (normally the delimiter).
    int64_t dim = atoi(tmp.data());
    result->push_back(dim);
    // Advance past the next delimiter, or stop after the last segment.
    size_t next_offset = tmp.find(delims);
    if (next_offset == std::string::npos) {
      break;
    } else {
      tmp = tmp.substr(next_offset + 1);
    }
  }
  return true;
}
} // namespace str_util
void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
#include <functional>
#include <set>
#include "mace/benchmark/statistics.h"
......@@ -53,7 +54,6 @@ std::string ShapeToString(
if (output_shape.empty()) {
return "";
}
std::stringstream stream;
stream << "[";
for (size_t i = 0; i < output_shape.size(); ++i) {
......@@ -94,6 +94,46 @@ std::string VectorToString(const std::vector<T> &vec) {
} // namespace
// Count the multiply-accumulate operations (MACs) performed by one op,
// derived from its filter shape and output shape.  Returns 0 for op types
// whose MACs are not modeled (e.g. Transpose, Pooling, Softmax).
// The index usage matches the converter's layouts as shown in the docs'
// example output (OIHW filters, NCHW outputs for Conv2D) — confirm for
// other frameworks' layouts.
int64_t StatMACs(const std::string &op_type,
                 const std::vector<int64_t> &filter_shape,
                 const std::vector<int64_t> &output_shape) {
  // Seed the folds with an int64_t: the original passed the int literal 1,
  // which makes std::accumulate fold in `int` and can overflow on large
  // shapes before the result is widened to int64_t.
  const int64_t kOne = 1;
  int64_t macs = 0;
  if (op_type == "Conv2D" || op_type == "Deconv2D") {
    // Each output element costs in_channels * kernel_h * kernel_w MACs.
    macs = output_shape[0] * output_shape[1] * output_shape[2]
        * output_shape[3]
        * filter_shape[2] * filter_shape[3] * filter_shape[1];
  } else if (op_type == "MatMul") {
    // Each output element is a dot product of length filter_shape.back().
    macs = std::accumulate(output_shape.begin(),
                           output_shape.end(),
                           kOne,
                           std::multiplies<int64_t>())
        * filter_shape.back();
  } else if (op_type == "DepthwiseConv2d") {
    // Depthwise: one kernel per channel, scaled by the channel multiplier
    // stored in filter_shape[0].
    macs = output_shape[0] * output_shape[1] * output_shape[2]
        * output_shape[3] * filter_shape[0] * filter_shape[2] * filter_shape[3];
  } else if (op_type == "DepthwiseDeconv2d") {
    macs = output_shape[0] * output_shape[1] * output_shape[2]
        * output_shape[3] * filter_shape[2] * filter_shape[3];
  } else if (op_type == "FullyConnected") {
    // One MAC per weight, repeated for every item in the batch.
    macs = output_shape[0] * std::accumulate(filter_shape.begin(),
                                             filter_shape.end(),
                                             kOne,
                                             std::multiplies<int64_t>());
  } else if (op_type == "BatchNorm") {
    // One multiply-add per output element.
    macs = std::accumulate(output_shape.begin(),
                           output_shape.end(),
                           kOne,
                           std::multiplies<int64_t>());
  } else if (op_type == "ResizeBilinear" || op_type == "ResizeBicubic") {
    // Approximately 3 MACs per interpolated output element.
    macs = 3 * std::accumulate(output_shape.begin(),
                               output_shape.end(),
                               kOne,
                               std::multiplies<int64_t>());
  }
  return macs;
}
void OpStat::StatMetadata(const RunMetadata &meta_data) {
if (meta_data.op_stats.empty()) {
LOG(FATAL) << "Op metadata should not be empty";
......@@ -112,6 +152,8 @@ void OpStat::StatMetadata(const RunMetadata &meta_data) {
record->type = op_stat.type;
record->args = op_stat.args;
record->output_shape = op_stat.output_shape;
record->macs =
StatMACs(op_stat.type, op_stat.args.kernels, op_stat.output_shape[0]);
record->order = order_idx;
order_idx += 1;
}
......@@ -148,7 +190,7 @@ std::string OpStat::StatByMetric(const Metric metric,
// generate string
std::string title = "Sort by " + MetricToString(metric);
const std::vector<std::string> header = {
"Node Type", "Start", "First", "Avg(ms)", "%", "cdf%",
"Op Type", "Start", "First", "Avg(ms)", "%", "cdf%", "GMACPS",
"Stride", "Pad", "Filter Shape", "Output Shape", "Dilation", "name"
};
std::vector<std::vector<std::string>> data;
......@@ -169,6 +211,9 @@ std::string OpStat::StatByMetric(const Metric metric,
FloatToString(record.rel_end.sum() * 100.f / total_time_.sum(), 3));
tuple.push_back(
FloatToString(accumulate_time * 100.f / total_time_.sum(), 3));
tuple.push_back(FloatToString(
record.macs < 1e-6 ? record.macs :
(record.macs * 1e-3) / record.rel_end.avg(), 3));
tuple.push_back(VectorToString<int>(record.args.strides));
if (record.args.padding_type != -1) {
tuple.push_back(PaddingTypeToString(record.args.padding_type));
......@@ -184,40 +229,43 @@ std::string OpStat::StatByMetric(const Metric metric,
return mace::string_util::StringFormatter::Table(title, header, data);
}
std::string OpStat::StatByNodeType() const {
std::string OpStat::StatByOpType() const {
if (records_.empty()) {
return "";
}
const int64_t round = total_time_.round();
int64_t total_time = 0;
std::map<std::string, int64_t> type_time_map;
std::map<std::string, int64_t> type_macs_map;
std::map<std::string, int64_t> type_count_map;
std::map<std::string, int64_t> type_called_times_map;
std::set<std::string> node_types_set;
std::set<std::string> op_types_set;
for (auto &record : records_) {
std::string node_type = record.second.type;
node_types_set.insert(node_type);
std::string op_type = record.second.type;
op_types_set.insert(op_type);
type_time_map[node_type] += record.second.rel_end.sum() / round;
type_time_map[op_type] += record.second.rel_end.sum() / round;
type_macs_map[op_type] += record.second.macs;
total_time += record.second.rel_end.sum() / round;
type_count_map[node_type] += 1;
type_called_times_map[node_type] += record.second.called_times / round;
type_count_map[op_type] += 1;
type_called_times_map[op_type] += record.second.called_times / round;
}
std::vector<std::string> node_types(node_types_set.begin(),
node_types_set.end());
std::sort(node_types.begin(), node_types.end(),
std::vector<std::string> op_types(op_types_set.begin(),
op_types_set.end());
std::sort(op_types.begin(), op_types.end(),
[&](const std::string &lhs, const std::string &rhs) {
return type_time_map[lhs] > type_time_map[rhs];
});
std::string title = "Stat by node type";
std::string title = "Stat by Op Type";
const std::vector<std::string> header = {
"Node Type", "Count", "Avg(ms)", "%", "cdf%", "Called times"
"Op Type", "Count", "Avg(ms)", "%", "cdf%", "MACs",
"GMACPS", "Called times"
};
float cdf = 0.0f;
std::vector<std::vector<std::string>> data;
for (auto type : node_types) {
for (auto type : op_types) {
const float avg_time = type_time_map[type] / 1000.0f;
const float percentage = type_time_map[type] * 100.0f / total_time;
cdf += percentage;
......@@ -228,12 +276,43 @@ std::string OpStat::StatByNodeType() const {
tuple.push_back(FloatToString(avg_time, 3));
tuple.push_back(FloatToString(percentage, 3));
tuple.push_back(FloatToString(cdf, 3));
tuple.push_back(IntToString(type_macs_map[type]));
tuple.push_back(FloatToString(
type_macs_map[type] < 1e-6 ? type_macs_map[type] :
(type_macs_map[type] * 1e-3) / type_time_map[type], 3));
tuple.push_back(IntToString(type_called_times_map[type]));
data.emplace_back(tuple);
}
return mace::string_util::StringFormatter::Table(title, header, data);
}
// Render the model-wide MACs statistics table: the total MAC count summed
// over all recorded ops, the number of rounds run, the throughput (G/s) of
// the first round and of the average round, and the standard deviation of
// the total run time.
std::string OpStat::StatByMACs() const {
  if (records_.empty()) {
    return "";
  }
  const int64_t round = total_time_.round();

  // Sum the per-op MAC counts into the model total.
  int64_t count = 0;
  for (auto &record : records_) {
    count += record.second.macs;
  }
  std::string title = "Stat by MACs(Multiply-Accumulation)";
  const std::vector<std::string> header = {
      "total", "round", "first(G/s)", "avg(G/s)", "std"
  };
  std::vector<std::vector<std::string>> data;
  std::vector<std::string> tuple;
  tuple.push_back(IntToString(count));
  tuple.push_back(IntToString(round));
  // count * 1e-3 / time yields G/s; this matches the documented example
  // (568,741,376 MACs at ~31.8 ms -> ~17.9 G/s), which implies total_time_
  // is recorded in microseconds — TODO confirm against TimeInfo.
  tuple.push_back(FloatToString((count * 1e-3) / total_time_.first(), 3));
  tuple.push_back(FloatToString((count * 1e-3) / total_time_.avg(), 3));
  tuple.push_back(FloatToString(total_time_.std_deviation(), 3));
  data.emplace_back(tuple);
  return mace::string_util::StringFormatter::Table(title, header, data);
}
std::string OpStat::Summary() const {
std::stringstream stream;
if (!records_.empty()) {
......@@ -252,9 +331,11 @@ void OpStat::PrintStat() const {
stream << StatByMetric(Metric::RUN_ORDER, 0) << std::endl;
// top-10 op stat by time
stream << StatByMetric(Metric::COMPUTATION_TIME, 10) << std::endl;
// op stat by node type
stream << StatByNodeType() << std::endl;
// op stat by op type
stream << StatByOpType() << std::endl;
}
// print MACs statistics
stream << StatByMACs();
// Print summary
stream << Summary();
......
......@@ -19,6 +19,7 @@
#include <cmath>
#include <iomanip>
#include <limits>
#include <locale>
#include <map>
#include <sstream>
#include <string>
......@@ -33,11 +34,33 @@ class RunMetadata;
namespace benchmark {
// stat the number of multiply-accumulate(MAC)
int64_t StatMACs(const std::string &op_type,
const std::vector<int64_t> &filter_shape,
const std::vector<int64_t> &output_shape);
// Converts an integer to a decimal string with ',' as the thousands
// separator, e.g. 1234567 -> "1,234,567", -1234 -> "-1,234".
//
// Fix: the diff left the old `return stream.str();` in place above the
// new grouping code, which made the separator logic unreachable; that
// stale early return is removed here.
template <typename IntType>
std::string IntToString(const IntType v) {
  std::stringstream stream;
  stream << v;
  const std::string src_str = stream.str();
  const size_t size = src_str.size();
  // Output length: one extra ',' per complete group of three digits
  // beyond the first group; a leading '-' does not count as a digit.
  size_t dst_size = size + ((size - 1) / 3);
  if (src_str[0] == '-') {
    dst_size = size + ((size - 2) / 3);
  }
  // Pre-fill the result with ',' and copy characters right-to-left;
  // every third copied digit skips one slot, leaving a separator there.
  std::string result(dst_size, ',');
  size_t dst_idx = dst_size - 1;
  for (size_t src_idx = 0; src_idx < size; ++src_idx) {
    if ((src_idx % 3) != 0 || src_idx == 0 || dst_idx == 0) {
      result[dst_idx] = src_str[size - 1 - src_idx];
    } else {
      dst_idx -= 1;  // keep the pre-filled ',' at the skipped slot
      result[dst_idx] = src_str[size - 1 - src_idx];
    }
    dst_idx -= 1;  // may wrap on the final iteration; loop exits next
  }
  return result;
}
template <typename FloatType>
......@@ -127,7 +150,7 @@ enum Metric {
COMPUTATION_TIME,
};
class OpStat{
class OpStat {
public:
void StatMetadata(const RunMetadata &meta_data);
......@@ -136,7 +159,8 @@ class OpStat{
private:
std::string StatByMetric(const Metric metric,
const int top_limit) const;
std::string StatByNodeType() const;
std::string StatByOpType() const;
std::string StatByMACs() const;
std::string Summary() const;
private:
......@@ -145,6 +169,7 @@ class OpStat{
std::string type;
std::vector<std::vector<int64_t>> output_shape;
ConvPoolArgs args;
int64_t macs;
int64_t order;
TimeInfo<int64_t> start;
TimeInfo<int64_t> rel_end;
......
......@@ -403,8 +403,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
std::string type = op->debug_def().type();
if (type.compare("Conv2D") == 0 ||
type.compare("FusedConv2D") == 0 ||
type.compare("Deconv2D") == 0 ||
type.compare("DepthwiseConv2d") == 0 ||
type.compare("DepthwiseDeconv2d") == 0 ||
type.compare("Pooling") == 0) {
strides = op->GetRepeatedArgs<int>("strides");
padding_type = op->GetOptionalArg<int>("padding", -1);
......@@ -415,6 +416,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
} else {
kernels = op->Input(1)->shape();
}
} else if (type.compare("MatMul") == 0) {
bool transpose_a = op->GetOptionalArg<bool>("transpose_a", false);
kernels = op->Input(0)->shape();
if (transpose_a) {
std::swap(kernels[kernels.size()-2], kernels[kernels.size()-1]);
}
} else if (type.compare("FullyConnected") == 0) {
kernels = op->Input(1)->shape();
}
std::vector<std::vector<int64_t>> output_shapes;
......
......@@ -28,7 +28,7 @@ namespace testing {
static std::vector<Benchmark *> *all_benchmarks = nullptr;
static int64_t bytes_processed;
static int64_t macc_processed;
static int64_t macs_processed = 0;
static int64_t accum_time = 0;
static int64_t start_time = 0;
......@@ -62,8 +62,8 @@ void Benchmark::Run(const char *pattern) {
// Internal perf regression tools depends on the output formatting,
// please keep in consistent when modifying
printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)",
"Iterations", "Input(MB/s)", "MACC(G/s)");
printf("%s\n", std::string(width + 44, '-').c_str());
"Iterations", "Input(MB/s)", "GMACPS");
printf("%s\n", std::string(width + 45, '-').c_str());
for (auto b : *all_benchmarks) {
if (!std::regex_match(b->name_, match, regex)) continue;
int iters;
......@@ -71,9 +71,9 @@ void Benchmark::Run(const char *pattern) {
b->Run(&iters, &seconds);
float mbps = (bytes_processed * 1e-6) / seconds;
// MACCs or other computations
float gmaccs = (macc_processed * 1e-9) / seconds;
float gmacs = (macs_processed * 1e-9) / seconds;
printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, b->name_.c_str(),
seconds * 1e9 / iters, iters, mbps, gmaccs);
seconds * 1e9 / iters, iters, mbps, gmacs);
}
}
......@@ -89,7 +89,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) {
int64_t iters = kMinIters;
while (true) {
bytes_processed = -1;
macc_processed = -1;
macs_processed = 0;
RestartTiming();
(*benchmark_func_)(iters);
StopTiming();
......@@ -108,7 +108,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) {
}
void BytesProcessed(int64_t n) { bytes_processed = n; }
void MaccProcessed(int64_t n) { macc_processed = n; }
void MacsProcessed(int64_t n) { macs_processed = n; }
void RestartTiming() {
accum_time = 0;
start_time = NowMicros();
......
......@@ -42,7 +42,7 @@ class Benchmark {
};
void BytesProcessed(int64_t);
void MaccProcessed(int64_t);
void MacsProcessed(int64_t);
void RestartTiming();
void StartTiming();
void StopTiming();
......
......@@ -230,6 +230,7 @@ cc_test(
linkstatic = 1,
deps = [
"test",
"//mace/benchmark:statistics",
"//mace/core:test_benchmark_main",
"//third_party/eigen3",
],
......
......@@ -62,7 +62,6 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......@@ -119,7 +118,6 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......@@ -179,7 +177,6 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......@@ -235,7 +232,6 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......@@ -292,7 +288,6 @@ void SigmoidBenchmark(
static void MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -59,7 +59,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
} \
......
......@@ -75,7 +75,7 @@ void BatchNorm(
static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MacsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -58,7 +58,6 @@ void BMBatchToSpace(
MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
} \
......
......@@ -65,7 +65,6 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -68,7 +68,6 @@ void FilterBufferToImage(int iters,
static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W); \
} \
......
......@@ -61,7 +61,6 @@ void ChannelShuffle(
MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
......
......@@ -49,7 +49,6 @@ void ConcatHelper(int iters, int concat_dim, int dim0, int dim1) {
net.Run();
}
const int64_t tot = static_cast<int64_t>(iters) * dim0 * dim1 * 2;
mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
......@@ -104,7 +103,6 @@ void OpenCLConcatHelper(int iters,
const int64_t tot =
static_cast<int64_t>(iters) *
(net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
......
......@@ -14,6 +14,7 @@
#include <algorithm>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
......@@ -154,9 +155,10 @@ void Conv2d<CPU, uint8_t>(int iters,
(H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
mace::testing::MaccProcessed(macc); \
const int64_t macs = \
static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
"Conv2D", {OC, C, KH, KW}, {N, oh, ow, OC}); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \
mace::Padding::P, OC); \
......
......@@ -44,7 +44,6 @@ void CropHelper(int iters, int crop_axis, int dim1, int offset) {
net.RunOp(D);
}
const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * dim1;
mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
......@@ -96,7 +95,6 @@ void OpenCLCropHelper(int iters,
const int64_t tot =
static_cast<int64_t>(iters) *
(net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
......
......@@ -14,6 +14,7 @@
#include <algorithm>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
......@@ -90,9 +91,10 @@ static void Deconv2d(int iters,
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t oh = OH; \
int64_t ow = OW; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
mace::testing::MaccProcessed(macc); \
const int64_t macs = \
static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
"Deconv2D", {OC, C, KH, KW}, {N, OH, OW, OC}); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Deconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \
mace::Padding::P, OC); \
......
......@@ -62,7 +62,6 @@ void DepthToSpace(
MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
......
......@@ -14,6 +14,7 @@
#include <algorithm>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
......@@ -115,9 +116,10 @@ void DepthwiseConv2d(int iters,
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \
const int64_t macs = \
static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
"DepthwiseConv2d", {M, C, KH, KW}, {N, oh, ow, C}); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
......
......@@ -14,6 +14,7 @@
#include <algorithm>
#include "mace/benchmark/statistics.h"
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -81,11 +82,12 @@ static void DepthwiseDeconv2d(int iters,
##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * H * W * KH * KW * C; \
mace::testing::MaccProcessed(macc); \
const int64_t macs = \
static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
"DepthwiseDeconv2d", {1, C, KH, KW}, {N, H, W, C}); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseDeconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, S, P); \
DepthwiseDeconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, S, P); \
} \
MACE_BENCHMARK( \
MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\
......
......@@ -66,7 +66,6 @@ void EltwiseBenchmark(
MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<ops::EltwiseType>(ELT_TYPE), N, H, W, C); \
......
......@@ -14,6 +14,7 @@
#include <string>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -104,11 +105,12 @@ void FCBenchmark<CPU, uint8_t>(
#define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
const int64_t macs = \
static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
"FullyConnected", {OC, H, W, C}, {N, 1, 1, OC}); \
const int64_t tot = \
static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
} \
......
......@@ -66,7 +66,6 @@ void GatherBenchmark(int iters,
MACE_BM_GATHER##_##N##_##IND##_##VOC##_##EMBED##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * IND * EMBED; \
mace::testing::MaccProcessed(0); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
GatherBenchmark<DEVICE, TYPE>(iters, N, IND, VOC, EMBED); \
} \
......
......@@ -59,7 +59,6 @@ static void LocalResponseNorm(
MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
LocalResponseNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -79,11 +79,11 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) {
static void \
MACE_BM_LSTMCELL_##N##_##INPUT_SIZE##_##HIDDEN_UNITS##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t macc = \
const int64_t macs = \
static_cast<int64_t>( \
iters) * N * (INPUT_SIZE + HIDDEN_UNITS) * 4 * HIDDEN_UNITS; \
const int64_t tot = static_cast<int64_t>(iters) * N * INPUT_SIZE; \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
LSTMCell<DEVICE, TYPE>(iters, N, INPUT_SIZE, HIDDEN_UNITS); \
} \
......
......@@ -19,6 +19,7 @@
#include <vector>
#include "public/gemmlowp.h"
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/gemm.h"
#include "mace/ops/sgemm.h"
......@@ -223,9 +224,10 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
#define MACE_BM_MATMUL_FUNC(M, K, N, FUNC, TYPE) \
static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * M * K * N; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("MatMul", {K}, {M, N}); \
const int64_t tot = static_cast<int64_t>(iters) * (M + N) * K; \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot * sizeof(TYPE)); \
MatmulBenchmark_##FUNC(iters, M, K, N); \
} \
......@@ -377,9 +379,10 @@ void MatMulTransposeBenchmark(
#define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("MatMul", {C}, {N, H, W}); \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
......@@ -392,9 +395,10 @@ void MatMulTransposeBenchmark(
#define MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE) \
static void MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("MatMul", {C}, {N, H, W}); \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulTransposeBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
......
......@@ -94,7 +94,6 @@ void MemoryAccessBenchmark_NHCW(
static void MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot * sizeof(float)); \
MemoryAccessBenchmark_##ORDER(iters, N, H, W, C); \
} \
......
......@@ -57,7 +57,6 @@ void Pad(int iters, int batch, int height,
static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD); \
} \
......
......@@ -81,7 +81,6 @@ void Pooling(int iters,
##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Pooling<DEVICE, TYPE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
PoolingType::PO); \
......
......@@ -82,7 +82,6 @@ void Dequantize(int iters, int count) {
MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Quantize<DEVICE, TYPE>(iters, N); \
} \
......@@ -97,7 +96,6 @@ void Dequantize(int iters, int count) {
MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Dequantize<DEVICE, TYPE>(iters, N); \
} \
......
......@@ -60,7 +60,6 @@ void Reduce(int iters, int batch, int channels,
MACE_BM_REDUCE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Reduce<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include <string>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -69,9 +71,10 @@ void ResizeBicubicBenchmark(int iters,
MACE_BM_RESIZE_BICUBIC_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("ResizeBicubic", {}, {N, H1, W1, C}); \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBicubicBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include <string>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -75,9 +77,10 @@ void ResizeBilinearBenchmark(int iters,
MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("ResizeBilinear", {}, {N, H1, W1, C}); \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \
......
......@@ -51,10 +51,9 @@ void Reverse(int iters, int batch, int channels, int height, int width) {
#define MACE_BM_REVERSE_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_REVERSE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = \
const int64_t macs = \
static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Reverse<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -98,7 +98,6 @@ void SoftmaxBenchmark<CPU, uint8_t>(
static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -64,7 +64,6 @@ void BMSpaceToBatch(
MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \
} \
......
......@@ -62,7 +62,6 @@ void SpaceToDepth(
MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
......
......@@ -65,7 +65,7 @@ void BMSplitHelper(int iters,
MACE_BM_SPLIT_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::MacsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSplitHelper<DEVICE, TYPE>(iters, {N, H, W, C}, NO); \
} \
......
......@@ -63,7 +63,6 @@ void SqrDiffMean(int iters, int batch, int channels,
MACE_BM_SQRDIFF_MEAN_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SqrDiffMean<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -58,7 +58,6 @@ void TransposeBenchmark(int iters,
static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {H, W}, {1, 0}); \
} \
......@@ -72,7 +71,6 @@ void TransposeBenchmark(int iters,
MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3}); \
} \
......
......@@ -7,10 +7,10 @@ licenses(["notice"]) # Apache 2.0
load(
"//mace:mace.bzl",
"if_android",
"if_hexagon_enabled",
"if_not_hexagon_enabled",
"if_openmp_enabled",
"if_neon_enabled",
"if_openmp_enabled",
"if_android_armv7",
"if_hexagon_enabled",
"if_opencl_enabled",
"if_quantize_enabled",
)
......@@ -32,16 +32,19 @@ cc_test(
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
"-DMACE_ENABLE_OPENMP",
"-fopenmp"
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
......@@ -62,16 +65,19 @@ cc_test(
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
"-DMACE_ENABLE_OPENMP",
"-fopenmp"
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
......@@ -92,16 +98,19 @@ cc_test(
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
"-DMACE_ENABLE_OPENMP",
"-fopenmp"
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
......
......@@ -50,7 +50,7 @@ def ops_benchmark_stdout_processor(stdout, dev, abi):
if len(parts) == 5 and parts[0].startswith("BM_"):
metrics["%s.time_ms" % parts[0]] = str(float(parts[1]) / 1e6)
metrics["%s.input_mb_per_sec" % parts[0]] = parts[3]
metrics["%s.gmacc_per_sec" % parts[0]] = parts[4]
metrics["%s.gmac_per_sec" % parts[0]] = parts[4]
# platform = dev[YAMLKeyword.target_socs]
# model = dev[YAMLKeyword.device_name]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册