提交 5f021151 编写于 作者: L liuqi

Add MACs statistics for model benchmark tool and related docs

上级 b31a29a3
......@@ -9,6 +9,7 @@ stages:
- api_test
- python_tools_tests
- model_tests
- quantization_tests
- build_android_demo
- ops_benchmark
- extra_tests
......@@ -62,6 +63,14 @@ api_test:
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
- >
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
ops_benchmark:
stage: ops_benchmark
......@@ -103,7 +112,7 @@ ndk_versions_compatible_tests:
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b;
for ndk in android-ndk-r15c android-ndk-r16 android-ndk-r17b;
do
new_ndk_path=${prefix_path}${ndk};
if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then
......@@ -111,8 +120,12 @@ ndk_versions_compatible_tests:
export PATH=$ANDROID_NDK_HOME:$PATH;
echo "ndk path: $ANDROID_NDK_HOME";
if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
fi
done
- export ANDROID_NDK_HOME=$DEFAULT_NDK_PATH
......@@ -131,9 +144,9 @@ python_tools_tests:
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a,arm64 --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,armhf --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,armhf --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
model_tests:
stage: model_tests
......@@ -142,23 +155,39 @@ model_tests:
- rm -rf mace-models
- rm -rf generic-mobile-devices
- GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
- CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml
- >
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
for CONF_FILE in mace-models/mobilenet-v1/mobilenet-v1.yml mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml;
do
python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
done
python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
- CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
- >
python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
- rm -rf mace-models
quantization_tests:
stage: quantization_tests
script:
- pwd
- rm -rf mace-models
- GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
- CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml
- >
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
- rm -rf mace-models
build_android_demo:
......
......@@ -27,6 +27,7 @@ The main documentation is organized into the following sections:
user_guide/basic_usage
user_guide/advanced_usage
user_guide/benchmark
user_guide/op_lists
user_guide/quantization_usage
......
......@@ -379,6 +379,8 @@ Useful Commands
* **benchmark and profile model**
the detailed information is in :doc:`benchmark`.
.. code:: sh
# Benchmark model, get detailed statistics of each Op.
......
......@@ -227,7 +227,7 @@ to run and validate your model.
* **benchmark**
benchmark and profile the model.
benchmark and profile the model. the details are in :doc:`benchmark`.
.. code:: sh
......
Benchmark usage
===============
This part contains the usage of MACE benchmark tools.
Overview
--------
As mentioned in the previous part, there are two kinds of benchmark tools,
one for operator and the other for model.
Operator Benchmark
------------------
Operator Benchmark is used to test and optimize the performance of a specific operator.
=====
Usage
=====
.. code:: bash
python tools/bazel_adb_run.py --target="//mace/ops:ops_benchmark" --run_target=True --args="--filter=.*BM_CONV.*"
======
Output
======
.. code:: bash
Benchmark Time(ns) Iterations Input(MB/s) GMACPS
------------------------------------------------------------------------------------------------------
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_CPU 1759129 479 114.09 29.21
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_GPU 4031301 226 49.79 12.75
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_half_GPU 3996357 266 25.11 12.86
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_uint8_t_CPU 914994 1093 54.84 56.15
===========
Explanation
===========
.. list-table::
:header-rows: 1
* - Options
- Usage
* - Benchmark
- Benchmark unit name.
* - Time
- Time of one round.
* - Iterations
- the number of iterations to run, which is between 10 and 1,000,000,000. The value is chosen automatically so that the total run time does not exceed 1s.
* - Input
- The bandwidth of dealing with input. the unit is MB/s.
* - GMACPS
- The speed of running MACs (multiply-accumulate operations). The unit is G/s.
Model Benchmark
---------------
Model Benchmark is used to test and optimize the performance of your model.
This tool could record the running time of the model and the detailed running information of each operator of your model.
=====
Usage
=====
.. code:: bash
python tools/converter.py benchmark --config=/path/to/your/model_deployment.yml
======
Output
======
.. code:: bash
I benchmark_model.cc:158 ---------------------------------------------------------------------
I benchmark_model.cc:158 Warm Up
I benchmark_model.cc:158 ----------------------------------------------------------------------
I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I benchmark_model.cc:158 ----------------------------------------------------------------------
I benchmark_model.cc:158 | 1 | 51.481 | 51.481 | 51.481 | 51.481 | 51.481 | 0.000 |
I benchmark_model.cc:158 ----------------------------------------------------------------------
I benchmark_model.cc:158
I benchmark_model.cc:158 ------------------------------------------------------------------------
I benchmark_model.cc:158 Run without statistics
I benchmark_model.cc:158 -------------------------------------------------------------------------
I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I benchmark_model.cc:158 -------------------------------------------------------------------------
I benchmark_model.cc:158 | 100 | 30.272 | 31.390 | 29.938 | 45.966 | 30.913 | 1850.983 |
I benchmark_model.cc:158 -------------------------------------------------------------------------
I benchmark_model.cc:158
I benchmark_model.cc:158 -----------------------------------------------------------------------
I benchmark_model.cc:158 Run with statistics
I benchmark_model.cc:158 ------------------------------------------------------------------------
I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I benchmark_model.cc:158 ------------------------------------------------------------------------
I benchmark_model.cc:158 | 100 | 32.358 | 33.327 | 32.293 | 33.607 | 33.002 | 310.435 |
I benchmark_model.cc:158 ------------------------------------------------------------------------
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 Sort by Run Order
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Op Type | Start | First | Avg(ms) | % | cdf% | GMACPS | Stride | Pad | Filter Shape | Output Shape | Dilation | name |
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Transpose | 0.000 | 0.102 | 0.100 | 0.315 | 0.315 | 0.000 | | | | [1,3,224,224] | | input |
I statistics.cc:343 | Conv2D | 0.107 | 1.541 | 1.570 | 4.943 | 5.258 | 6.904 | [2,2] | SAME | [32,3,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_0/Relu6 |
I statistics.cc:343 | DepthwiseConv2d | 1.724 | 0.936 | 0.944 | 2.972 | 8.230 | 3.827 | [1,1] | SAME | [1,32,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6 |
I statistics.cc:343 | Softmax | 32.835 | 0.039 | 0.042 | 0.131 | 99.996 | 0.000 | | | | [1,1001] | | MobilenetV1/Predictions/Softmax |
I statistics.cc:343 | Identity | 32.880 | 0.001 | 0.001 | 0.004 | 100.000 | 0.000 | | | | [1,1001] | | mace_output_node_MobilenetV1/Predictions/Reshape_1 |
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 Sort by Computation Time
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Op Type | Start | First | Avg(ms) | % | cdf% | GMACPS | Stride | Pad | Filter Shape | Output Shape | Dilation | name |
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Conv2D | 30.093 | 2.102 | 2.198 | 6.922 | 6.922 | 23.372 | [1,1] | SAME | [1024,1024,1,1] | [1,1024,7,7] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 7.823 | 2.115 | 2.164 | 6.813 | 13.735 | 23.747 | [1,1] | SAME | [128,128,1,1] | [1,128,56,56] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 15.859 | 2.119 | 2.109 | 6.642 | 20.377 | 24.358 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 23.619 | 2.087 | 2.096 | 6.599 | 26.976 | 24.517 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 26.204 | 2.081 | 2.093 | 6.590 | 33.567 | 24.549 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 21.038 | 2.036 | 2.091 | 6.585 | 40.152 | 24.569 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 18.465 | 2.034 | 2.082 | 6.554 | 46.706 | 24.684 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 2.709 | 1.984 | 2.058 | 6.482 | 53.188 | 12.480 | [1,1] | SAME | [64,32,1,1] | [1,64,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 12.220 | 1.788 | 1.901 | 5.986 | 59.174 | 27.027 | [1,1] | SAME | [256,256,1,1] | [1,256,28,28] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 0.107 | 1.541 | 1.570 | 4.943 | 64.117 | 6.904 | [2,2] | SAME | [32,3,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_0/Relu6 |
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343 Stat by Op Type
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343 | Op Type | Count | Avg(ms) | % | cdf% | MACs | GMACPS | Called times |
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343 | Conv2D | 15 | 24.978 | 78.693 | 78.693 | 551,355,392 | 22.074 | 15 |
I statistics.cc:343 | DepthwiseConv2d | 13 | 6.543 | 20.614 | 99.307 | 17,385,984 | 2.657 | 13 |
I statistics.cc:343 | Transpose | 1 | 0.100 | 0.315 | 99.622 | 0 | 0.000 | 1 |
I statistics.cc:343 | Pooling | 1 | 0.072 | 0.227 | 99.849 | 0 | 0.000 | 1 |
I statistics.cc:343 | Softmax | 1 | 0.041 | 0.129 | 99.978 | 0 | 0.000 | 1 |
I statistics.cc:343 | Squeeze | 1 | 0.006 | 0.019 | 99.997 | 0 | 0.000 | 1 |
I statistics.cc:343 | Identity | 1 | 0.001 | 0.003 | 100.000 | 0 | 0.000 | 1 |
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 Stat by MACs(Multiply-Accumulation)
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 | total | round | first(G/s) | avg(G/s) | std |
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 | 568,741,376 | 100 | 18.330 | 17.909 | 301.326 |
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 ------------------------------------------------------------------------
I statistics.cc:343 Summary of Ops' Stat
I statistics.cc:343 ------------------------------------------------------------------------
I statistics.cc:343 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I statistics.cc:343 ------------------------------------------------------------------------
I statistics.cc:343 | 100 | 31.028 | 32.093 | 31.028 | 32.346 | 31.758 | 301.326 |
I statistics.cc:343 ------------------------------------------------------------------------
===========
Explanation
===========
There are 8 sections of the output information.
1. **Warm Up**
This section lists the time information of warm-up run.
The detailed explanation is listed below.
.. list-table::
:header-rows: 1
* - Key
- Explanation
* - round
- the number of rounds that have been run.
* - first
- the run time of first round. unit is millisecond.
* - curr
- the run time of last round. unit is millisecond.
* - min
- the minimal run time of all rounds. unit is millisecond.
* - max
- the maximal run time of all rounds. unit is millisecond.
* - avg
- the average run time of all rounds. unit is millisecond.
* - std
- the standard deviation of all rounds.
2. **Run without statistics**
This section lists the run time information without statistics code.
the detailed explanation is the same as the section of Warm Up.
3. **Run with statistics**
This section lists the run time information with statistics code,
the time maybe longer compared with the second section.
the detailed explanation is the same as the section of Warm Up.
4. **Sort by Run Order**
This section lists the detailed run information of every operator in your model.
The operators are listed in run order; every line is one operator of your model.
The detailed explanation is listed below.
.. list-table::
:header-rows: 1
* - Key
- Explanation
* - Op Type
- the type of operator.
* - Start
- the start time of the operator. unit is millisecond.
* - First
- the run time of first round. unit is millisecond.
* - Avg
- the average run time of all rounds. unit is millisecond.
* - %
- the percentage of total running time.
* - cdf%
- the cumulative percentage of running time.
* - GMACPS
- The number of run MACs(multiply-accumulation) per second. the unit is G/s.
* - Stride
- the stride parameter of the operator if exist.
* - Pad
- the pad parameter of the operator if exist.
* - Filter Shape
- the filter shape of the operator if exist.
* - Output Shape
- the output shape of the operator.
* - Dilation
- the dilation parameter of the operator if exist.
* - Name
- the name of the operator.
5. **Sort by Computation time**
This section lists the top-10 most time-consuming operators.
The operators are listed by computation time;
the detailed explanation is the same as previous section.
6. **Stat by Op Type**
This section stats the run information about operators based on operator type.
.. list-table::
:header-rows: 1
* - Op Type
- the type of operator.
* - Count
- the number of operators with the type.
* - Avg
- the average run time of the operator. unit is millisecond.
* - %
- the percentage of total running time.
* - cdf%
- the cumulative percentage of running time.
* - MACs
- The number of MACs(multiply-accumulation).
* - GMACPS
- The number of MACs(multiply-accumulation) runs per second. the unit is G/s.
* - Called times
- the number of called times in all rounds.
7. **Stat by MACs**
This section stats the MACs information of your model.
.. list-table::
:header-rows: 1
* - total
- the number of MACs of your model.
* - round
- the number of rounds that have been run.
* - First
- the GMACPS of the first round. The unit is G/s.
* - Avg
- the average GMACPS of all rounds. The unit is G/s.
* - std
- the standard deviation of all rounds.
8. **Summary of Ops' Stat**
This section lists the total run time obtained by summing every operator's run time,
which may be shorter than the model's measured run time with statistics.
the detailed explanation is the same as the section of Warm Up.
......@@ -15,6 +15,7 @@ cc_library(
srcs = ["statistics.cc"],
hdrs = ["statistics.h"],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
visibility = ["//visibility:public"],
deps = [
"//mace/utils",
],
......
......@@ -48,23 +48,6 @@ std::vector<std::string> Split(const std::string &str, char delims) {
return result;
}
// Split `str` on the `delims` character and parse each segment as an
// integer, appending the parsed values to `result`.  Always returns true;
// a segment that does not start with digits is parsed as 0 by atoi.
// NOTE(review): atoi returns `int`, so values outside the int range are
// truncated even though the destination is int64_t — confirm callers never
// pass 64-bit dimensions.
bool SplitAndParseToInts(const std::string &str,
                         char delims,
                         std::vector<int64_t> *result) {
  std::string tmp = str;
  while (!tmp.empty()) {
    // Parse the leading number of the remaining string; atoi stops at the
    // first non-numeric character (normally the delimiter).
    int64_t dim = atoi(tmp.data());
    result->push_back(dim);
    // Advance past the next delimiter, or stop after the last segment.
    size_t next_offset = tmp.find(delims);
    if (next_offset == std::string::npos) {
      break;
    } else {
      tmp = tmp.substr(next_offset + 1);
    }
  }
  return true;
}
} // namespace str_util
void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
#include <functional>
#include <set>
#include "mace/benchmark/statistics.h"
......@@ -53,7 +54,6 @@ std::string ShapeToString(
if (output_shape.empty()) {
return "";
}
std::stringstream stream;
stream << "[";
for (size_t i = 0; i < output_shape.size(); ++i) {
......@@ -94,6 +94,46 @@ std::string VectorToString(const std::vector<T> &vec) {
} // namespace
// Count the multiply-accumulate operations (MACs) performed by one op,
// derived from its filter shape and output shape.  Returns 0 for op types
// whose MACs are not modeled (e.g. Transpose, Pooling, Softmax).
// The index usage matches the converter's layouts as shown in the docs'
// example output (OIHW filters, NCHW outputs for Conv2D) — confirm for
// other frameworks' layouts.
int64_t StatMACs(const std::string &op_type,
                 const std::vector<int64_t> &filter_shape,
                 const std::vector<int64_t> &output_shape) {
  // Seed the folds with an int64_t: the original passed the int literal 1,
  // which makes std::accumulate fold in `int` and can overflow on large
  // shapes before the result is widened to int64_t.
  const int64_t kOne = 1;
  int64_t macs = 0;
  if (op_type == "Conv2D" || op_type == "Deconv2D") {
    // Each output element costs in_channels * kernel_h * kernel_w MACs.
    macs = output_shape[0] * output_shape[1] * output_shape[2]
        * output_shape[3]
        * filter_shape[2] * filter_shape[3] * filter_shape[1];
  } else if (op_type == "MatMul") {
    // Each output element is a dot product of length filter_shape.back().
    macs = std::accumulate(output_shape.begin(),
                           output_shape.end(),
                           kOne,
                           std::multiplies<int64_t>())
        * filter_shape.back();
  } else if (op_type == "DepthwiseConv2d") {
    // Depthwise: one kernel per channel, scaled by the channel multiplier
    // stored in filter_shape[0].
    macs = output_shape[0] * output_shape[1] * output_shape[2]
        * output_shape[3] * filter_shape[0] * filter_shape[2] * filter_shape[3];
  } else if (op_type == "DepthwiseDeconv2d") {
    macs = output_shape[0] * output_shape[1] * output_shape[2]
        * output_shape[3] * filter_shape[2] * filter_shape[3];
  } else if (op_type == "FullyConnected") {
    // One MAC per weight, repeated for every item in the batch.
    macs = output_shape[0] * std::accumulate(filter_shape.begin(),
                                             filter_shape.end(),
                                             kOne,
                                             std::multiplies<int64_t>());
  } else if (op_type == "BatchNorm") {
    // One multiply-add per output element.
    macs = std::accumulate(output_shape.begin(),
                           output_shape.end(),
                           kOne,
                           std::multiplies<int64_t>());
  } else if (op_type == "ResizeBilinear" || op_type == "ResizeBicubic") {
    // Approximately 3 MACs per interpolated output element.
    macs = 3 * std::accumulate(output_shape.begin(),
                               output_shape.end(),
                               kOne,
                               std::multiplies<int64_t>());
  }
  return macs;
}
void OpStat::StatMetadata(const RunMetadata &meta_data) {
if (meta_data.op_stats.empty()) {
LOG(FATAL) << "Op metadata should not be empty";
......@@ -112,6 +152,8 @@ void OpStat::StatMetadata(const RunMetadata &meta_data) {
record->type = op_stat.type;
record->args = op_stat.args;
record->output_shape = op_stat.output_shape;
record->macs =
StatMACs(op_stat.type, op_stat.args.kernels, op_stat.output_shape[0]);
record->order = order_idx;
order_idx += 1;
}
......@@ -148,7 +190,7 @@ std::string OpStat::StatByMetric(const Metric metric,
// generate string
std::string title = "Sort by " + MetricToString(metric);
const std::vector<std::string> header = {
"Node Type", "Start", "First", "Avg(ms)", "%", "cdf%",
"Op Type", "Start", "First", "Avg(ms)", "%", "cdf%", "GMACPS",
"Stride", "Pad", "Filter Shape", "Output Shape", "Dilation", "name"
};
std::vector<std::vector<std::string>> data;
......@@ -169,6 +211,9 @@ std::string OpStat::StatByMetric(const Metric metric,
FloatToString(record.rel_end.sum() * 100.f / total_time_.sum(), 3));
tuple.push_back(
FloatToString(accumulate_time * 100.f / total_time_.sum(), 3));
tuple.push_back(FloatToString(
record.macs < 1e-6 ? record.macs :
(record.macs * 1e-3) / record.rel_end.avg(), 3));
tuple.push_back(VectorToString<int>(record.args.strides));
if (record.args.padding_type != -1) {
tuple.push_back(PaddingTypeToString(record.args.padding_type));
......@@ -184,40 +229,43 @@ std::string OpStat::StatByMetric(const Metric metric,
return mace::string_util::StringFormatter::Table(title, header, data);
}
std::string OpStat::StatByNodeType() const {
std::string OpStat::StatByOpType() const {
if (records_.empty()) {
return "";
}
const int64_t round = total_time_.round();
int64_t total_time = 0;
std::map<std::string, int64_t> type_time_map;
std::map<std::string, int64_t> type_macs_map;
std::map<std::string, int64_t> type_count_map;
std::map<std::string, int64_t> type_called_times_map;
std::set<std::string> node_types_set;
std::set<std::string> op_types_set;
for (auto &record : records_) {
std::string node_type = record.second.type;
node_types_set.insert(node_type);
std::string op_type = record.second.type;
op_types_set.insert(op_type);
type_time_map[node_type] += record.second.rel_end.sum() / round;
type_time_map[op_type] += record.second.rel_end.sum() / round;
type_macs_map[op_type] += record.second.macs;
total_time += record.second.rel_end.sum() / round;
type_count_map[node_type] += 1;
type_called_times_map[node_type] += record.second.called_times / round;
type_count_map[op_type] += 1;
type_called_times_map[op_type] += record.second.called_times / round;
}
std::vector<std::string> node_types(node_types_set.begin(),
node_types_set.end());
std::sort(node_types.begin(), node_types.end(),
std::vector<std::string> op_types(op_types_set.begin(),
op_types_set.end());
std::sort(op_types.begin(), op_types.end(),
[&](const std::string &lhs, const std::string &rhs) {
return type_time_map[lhs] > type_time_map[rhs];
});
std::string title = "Stat by node type";
std::string title = "Stat by Op Type";
const std::vector<std::string> header = {
"Node Type", "Count", "Avg(ms)", "%", "cdf%", "Called times"
"Op Type", "Count", "Avg(ms)", "%", "cdf%", "MACs",
"GMACPS", "Called times"
};
float cdf = 0.0f;
std::vector<std::vector<std::string>> data;
for (auto type : node_types) {
for (auto type : op_types) {
const float avg_time = type_time_map[type] / 1000.0f;
const float percentage = type_time_map[type] * 100.0f / total_time;
cdf += percentage;
......@@ -228,12 +276,43 @@ std::string OpStat::StatByNodeType() const {
tuple.push_back(FloatToString(avg_time, 3));
tuple.push_back(FloatToString(percentage, 3));
tuple.push_back(FloatToString(cdf, 3));
tuple.push_back(IntToString(type_macs_map[type]));
tuple.push_back(FloatToString(
type_macs_map[type] < 1e-6 ? type_macs_map[type] :
(type_macs_map[type] * 1e-3) / type_time_map[type], 3));
tuple.push_back(IntToString(type_called_times_map[type]));
data.emplace_back(tuple);
}
return mace::string_util::StringFormatter::Table(title, header, data);
}
// Render the model-wide MACs statistics table: the total MAC count summed
// over all recorded ops, the number of rounds run, the throughput (G/s) of
// the first round and of the average round, and the standard deviation of
// the total run time.
std::string OpStat::StatByMACs() const {
  if (records_.empty()) {
    return "";
  }
  const int64_t round = total_time_.round();

  // Sum the per-op MAC counts into the model total.
  int64_t count = 0;
  for (auto &record : records_) {
    count += record.second.macs;
  }
  std::string title = "Stat by MACs(Multiply-Accumulation)";
  const std::vector<std::string> header = {
      "total", "round", "first(G/s)", "avg(G/s)", "std"
  };
  std::vector<std::vector<std::string>> data;
  std::vector<std::string> tuple;
  tuple.push_back(IntToString(count));
  tuple.push_back(IntToString(round));
  // count * 1e-3 / time yields G/s; this matches the documented example
  // (568,741,376 MACs at ~31.8 ms -> ~17.9 G/s), which implies total_time_
  // is recorded in microseconds — TODO confirm against TimeInfo.
  tuple.push_back(FloatToString((count * 1e-3) / total_time_.first(), 3));
  tuple.push_back(FloatToString((count * 1e-3) / total_time_.avg(), 3));
  tuple.push_back(FloatToString(total_time_.std_deviation(), 3));
  data.emplace_back(tuple);
  return mace::string_util::StringFormatter::Table(title, header, data);
}
std::string OpStat::Summary() const {
std::stringstream stream;
if (!records_.empty()) {
......@@ -252,9 +331,11 @@ void OpStat::PrintStat() const {
stream << StatByMetric(Metric::RUN_ORDER, 0) << std::endl;
// top-10 op stat by time
stream << StatByMetric(Metric::COMPUTATION_TIME, 10) << std::endl;
// op stat by node type
stream << StatByNodeType() << std::endl;
// op stat by op type
stream << StatByOpType() << std::endl;
}
// print MACs statistics
stream << StatByMACs();
// Print summary
stream << Summary();
......
......@@ -19,6 +19,7 @@
#include <cmath>
#include <iomanip>
#include <limits>
#include <locale>
#include <map>
#include <sstream>
#include <string>
......@@ -33,11 +34,33 @@ class RunMetadata;
namespace benchmark {
// stat the number of multiply-accumulate(MAC)
int64_t StatMACs(const std::string &op_type,
const std::vector<int64_t> &filter_shape,
const std::vector<int64_t> &output_shape);
// Converts an integer to a decimal string with ',' as the thousands
// separator, e.g. 1234567 -> "1,234,567", -1234 -> "-1,234".
//
// Fix: the diff left the old `return stream.str();` in place above the
// new grouping code, which made the separator logic unreachable; that
// stale early return is removed here.
template <typename IntType>
std::string IntToString(const IntType v) {
  std::stringstream stream;
  stream << v;
  const std::string src_str = stream.str();
  const size_t size = src_str.size();
  // Output length: one extra ',' per complete group of three digits
  // beyond the first group; a leading '-' does not count as a digit.
  size_t dst_size = size + ((size - 1) / 3);
  if (src_str[0] == '-') {
    dst_size = size + ((size - 2) / 3);
  }
  // Pre-fill the result with ',' and copy characters right-to-left;
  // every third copied digit skips one slot, leaving a separator there.
  std::string result(dst_size, ',');
  size_t dst_idx = dst_size - 1;
  for (size_t src_idx = 0; src_idx < size; ++src_idx) {
    if ((src_idx % 3) != 0 || src_idx == 0 || dst_idx == 0) {
      result[dst_idx] = src_str[size - 1 - src_idx];
    } else {
      dst_idx -= 1;  // keep the pre-filled ',' at the skipped slot
      result[dst_idx] = src_str[size - 1 - src_idx];
    }
    dst_idx -= 1;  // may wrap on the final iteration; loop exits next
  }
  return result;
}
template <typename FloatType>
......@@ -127,7 +150,7 @@ enum Metric {
COMPUTATION_TIME,
};
class OpStat{
class OpStat {
public:
void StatMetadata(const RunMetadata &meta_data);
......@@ -136,7 +159,8 @@ class OpStat{
private:
std::string StatByMetric(const Metric metric,
const int top_limit) const;
std::string StatByNodeType() const;
std::string StatByOpType() const;
std::string StatByMACs() const;
std::string Summary() const;
private:
......@@ -145,6 +169,7 @@ class OpStat{
std::string type;
std::vector<std::vector<int64_t>> output_shape;
ConvPoolArgs args;
int64_t macs;
int64_t order;
TimeInfo<int64_t> start;
TimeInfo<int64_t> rel_end;
......
......@@ -403,8 +403,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
std::string type = op->debug_def().type();
if (type.compare("Conv2D") == 0 ||
type.compare("FusedConv2D") == 0 ||
type.compare("Deconv2D") == 0 ||
type.compare("DepthwiseConv2d") == 0 ||
type.compare("DepthwiseDeconv2d") == 0 ||
type.compare("Pooling") == 0) {
strides = op->GetRepeatedArgs<int>("strides");
padding_type = op->GetOptionalArg<int>("padding", -1);
......@@ -415,6 +416,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
} else {
kernels = op->Input(1)->shape();
}
} else if (type.compare("MatMul") == 0) {
bool transpose_a = op->GetOptionalArg<bool>("transpose_a", false);
kernels = op->Input(0)->shape();
if (transpose_a) {
std::swap(kernels[kernels.size()-2], kernels[kernels.size()-1]);
}
} else if (type.compare("FullyConnected") == 0) {
kernels = op->Input(1)->shape();
}
std::vector<std::vector<int64_t>> output_shapes;
......
......@@ -28,7 +28,7 @@ namespace testing {
static std::vector<Benchmark *> *all_benchmarks = nullptr;
static int64_t bytes_processed;
static int64_t macc_processed;
static int64_t macs_processed = 0;
static int64_t accum_time = 0;
static int64_t start_time = 0;
......@@ -62,8 +62,8 @@ void Benchmark::Run(const char *pattern) {
// Internal perf regression tools depends on the output formatting,
// please keep in consistent when modifying
printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)",
"Iterations", "Input(MB/s)", "MACC(G/s)");
printf("%s\n", std::string(width + 44, '-').c_str());
"Iterations", "Input(MB/s)", "GMACPS");
printf("%s\n", std::string(width + 45, '-').c_str());
for (auto b : *all_benchmarks) {
if (!std::regex_match(b->name_, match, regex)) continue;
int iters;
......@@ -71,9 +71,9 @@ void Benchmark::Run(const char *pattern) {
b->Run(&iters, &seconds);
float mbps = (bytes_processed * 1e-6) / seconds;
// MACCs or other computations
float gmaccs = (macc_processed * 1e-9) / seconds;
float gmacs = (macs_processed * 1e-9) / seconds;
printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, b->name_.c_str(),
seconds * 1e9 / iters, iters, mbps, gmaccs);
seconds * 1e9 / iters, iters, mbps, gmacs);
}
}
......@@ -89,7 +89,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) {
int64_t iters = kMinIters;
while (true) {
bytes_processed = -1;
macc_processed = -1;
macs_processed = 0;
RestartTiming();
(*benchmark_func_)(iters);
StopTiming();
......@@ -108,7 +108,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) {
}
void BytesProcessed(int64_t n) { bytes_processed = n; }
void MaccProcessed(int64_t n) { macc_processed = n; }
void MacsProcessed(int64_t n) { macs_processed = n; }
void RestartTiming() {
accum_time = 0;
start_time = NowMicros();
......
......@@ -42,7 +42,7 @@ class Benchmark {
};
void BytesProcessed(int64_t);
void MaccProcessed(int64_t);
void MacsProcessed(int64_t);
void RestartTiming();
void StartTiming();
void StopTiming();
......
......@@ -230,6 +230,7 @@ cc_test(
linkstatic = 1,
deps = [
"test",
"//mace/benchmark:statistics",
"//mace/core:test_benchmark_main",
"//third_party/eigen3",
],
......
......@@ -62,7 +62,6 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......@@ -119,7 +118,6 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......@@ -179,7 +177,6 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......@@ -235,7 +232,6 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......@@ -292,7 +288,6 @@ void SigmoidBenchmark(
static void MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -59,7 +59,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
} \
......
......@@ -75,7 +75,7 @@ void BatchNorm(
static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::MacsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -58,7 +58,6 @@ void BMBatchToSpace(
MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
} \
......
......@@ -65,7 +65,6 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -68,7 +68,6 @@ void FilterBufferToImage(int iters,
static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W); \
} \
......
......@@ -61,7 +61,6 @@ void ChannelShuffle(
MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
......
......@@ -49,7 +49,6 @@ void ConcatHelper(int iters, int concat_dim, int dim0, int dim1) {
net.Run();
}
const int64_t tot = static_cast<int64_t>(iters) * dim0 * dim1 * 2;
mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
......@@ -104,7 +103,6 @@ void OpenCLConcatHelper(int iters,
const int64_t tot =
static_cast<int64_t>(iters) *
(net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
......
......@@ -14,6 +14,7 @@
#include <algorithm>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
......@@ -154,9 +155,10 @@ void Conv2d<CPU, uint8_t>(int iters,
(H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
mace::testing::MaccProcessed(macc); \
const int64_t macs = \
static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
"Conv2D", {OC, C, KH, KW}, {N, oh, ow, OC}); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \
mace::Padding::P, OC); \
......
......@@ -44,7 +44,6 @@ void CropHelper(int iters, int crop_axis, int dim1, int offset) {
net.RunOp(D);
}
const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * dim1;
mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
......@@ -96,7 +95,6 @@ void OpenCLCropHelper(int iters,
const int64_t tot =
static_cast<int64_t>(iters) *
(net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
......
......@@ -14,6 +14,7 @@
#include <algorithm>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
......@@ -90,9 +91,10 @@ static void Deconv2d(int iters,
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t oh = OH; \
int64_t ow = OW; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
mace::testing::MaccProcessed(macc); \
const int64_t macs = \
static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
"Deconv2D", {OC, C, KH, KW}, {N, OH, OW, OC}); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Deconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \
mace::Padding::P, OC); \
......
......@@ -62,7 +62,6 @@ void DepthToSpace(
MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
......
......@@ -14,6 +14,7 @@
#include <algorithm>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
......@@ -115,9 +116,10 @@ void DepthwiseConv2d(int iters,
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \
const int64_t macs = \
static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
"DepthwiseConv2d", {M, C, KH, KW}, {N, oh, ow, C}); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
......
......@@ -14,6 +14,7 @@
#include <algorithm>
#include "mace/benchmark/statistics.h"
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -81,11 +82,12 @@ static void DepthwiseDeconv2d(int iters,
##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * H * W * KH * KW * C; \
mace::testing::MaccProcessed(macc); \
const int64_t macs = \
static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
"DepthwiseDeconv2d", {1, C, KH, KW}, {N, H, W, C}); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseDeconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, S, P); \
DepthwiseDeconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, S, P); \
} \
MACE_BENCHMARK( \
MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\
......
......@@ -66,7 +66,6 @@ void EltwiseBenchmark(
MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<ops::EltwiseType>(ELT_TYPE), N, H, W, C); \
......
......@@ -14,6 +14,7 @@
#include <string>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -104,11 +105,12 @@ void FCBenchmark<CPU, uint8_t>(
#define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
const int64_t macs = \
static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
"FullyConnected", {OC, H, W, C}, {N, 1, 1, OC}); \
const int64_t tot = \
static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
} \
......
......@@ -66,7 +66,6 @@ void GatherBenchmark(int iters,
MACE_BM_GATHER##_##N##_##IND##_##VOC##_##EMBED##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * IND * EMBED; \
mace::testing::MaccProcessed(0); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
GatherBenchmark<DEVICE, TYPE>(iters, N, IND, VOC, EMBED); \
} \
......
......@@ -59,7 +59,6 @@ static void LocalResponseNorm(
MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
LocalResponseNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -79,11 +79,11 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) {
static void \
MACE_BM_LSTMCELL_##N##_##INPUT_SIZE##_##HIDDEN_UNITS##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t macc = \
const int64_t macs = \
static_cast<int64_t>( \
iters) * N * (INPUT_SIZE + HIDDEN_UNITS) * 4 * HIDDEN_UNITS; \
const int64_t tot = static_cast<int64_t>(iters) * N * INPUT_SIZE; \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
LSTMCell<DEVICE, TYPE>(iters, N, INPUT_SIZE, HIDDEN_UNITS); \
} \
......
......@@ -19,6 +19,7 @@
#include <vector>
#include "public/gemmlowp.h"
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/gemm.h"
#include "mace/ops/sgemm.h"
......@@ -223,9 +224,10 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
#define MACE_BM_MATMUL_FUNC(M, K, N, FUNC, TYPE) \
static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * M * K * N; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("MatMul", {K}, {M, N}); \
const int64_t tot = static_cast<int64_t>(iters) * (M + N) * K; \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot * sizeof(TYPE)); \
MatmulBenchmark_##FUNC(iters, M, K, N); \
} \
......@@ -377,9 +379,10 @@ void MatMulTransposeBenchmark(
#define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("MatMul", {C}, {N, H, W}); \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
......@@ -392,9 +395,10 @@ void MatMulTransposeBenchmark(
#define MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE) \
static void MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("MatMul", {C}, {N, H, W}); \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulTransposeBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
......
......@@ -94,7 +94,6 @@ void MemoryAccessBenchmark_NHCW(
static void MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot * sizeof(float)); \
MemoryAccessBenchmark_##ORDER(iters, N, H, W, C); \
} \
......
......@@ -57,7 +57,6 @@ void Pad(int iters, int batch, int height,
static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD); \
} \
......
......@@ -81,7 +81,6 @@ void Pooling(int iters,
##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Pooling<DEVICE, TYPE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
PoolingType::PO); \
......
......@@ -82,7 +82,6 @@ void Dequantize(int iters, int count) {
MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Quantize<DEVICE, TYPE>(iters, N); \
} \
......@@ -97,7 +96,6 @@ void Dequantize(int iters, int count) {
MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Dequantize<DEVICE, TYPE>(iters, N); \
} \
......
......@@ -60,7 +60,6 @@ void Reduce(int iters, int batch, int channels,
MACE_BM_REDUCE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Reduce<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include <string>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -69,9 +71,10 @@ void ResizeBicubicBenchmark(int iters,
MACE_BM_RESIZE_BICUBIC_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("ResizeBicubic", {}, {N, H1, W1, C}); \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBicubicBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include <string>
#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
......@@ -75,9 +77,10 @@ void ResizeBilinearBenchmark(int iters,
MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE( \
int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
const int64_t macs = static_cast<int64_t>(iters) * \
mace::benchmark::StatMACs("ResizeBilinear", {}, {N, H1, W1, C}); \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
mace::testing::MaccProcessed(macc); \
mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \
......
......@@ -51,10 +51,9 @@ void Reverse(int iters, int batch, int channels, int height, int width) {
#define MACE_BM_REVERSE_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_REVERSE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = \
const int64_t macs = \
static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Reverse<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -98,7 +98,6 @@ void SoftmaxBenchmark<CPU, uint8_t>(
static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -64,7 +64,6 @@ void BMSpaceToBatch(
MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \
} \
......
......@@ -62,7 +62,6 @@ void SpaceToDepth(
MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
......
......@@ -65,7 +65,7 @@ void BMSplitHelper(int iters,
MACE_BM_SPLIT_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::MacsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSplitHelper<DEVICE, TYPE>(iters, {N, H, W, C}, NO); \
} \
......
......@@ -63,7 +63,6 @@ void SqrDiffMean(int iters, int batch, int channels,
MACE_BM_SQRDIFF_MEAN_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SqrDiffMean<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -58,7 +58,6 @@ void TransposeBenchmark(int iters,
static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {H, W}, {1, 0}); \
} \
......@@ -72,7 +71,6 @@ void TransposeBenchmark(int iters,
MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3}); \
} \
......
......@@ -7,10 +7,10 @@ licenses(["notice"]) # Apache 2.0
load(
"//mace:mace.bzl",
"if_android",
"if_hexagon_enabled",
"if_not_hexagon_enabled",
"if_openmp_enabled",
"if_neon_enabled",
"if_openmp_enabled",
"if_android_armv7",
"if_hexagon_enabled",
"if_opencl_enabled",
"if_quantize_enabled",
)
......@@ -32,16 +32,19 @@ cc_test(
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
"-DMACE_ENABLE_OPENMP",
"-fopenmp"
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
......@@ -62,16 +65,19 @@ cc_test(
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
"-DMACE_ENABLE_OPENMP",
"-fopenmp"
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
......@@ -92,16 +98,19 @@ cc_test(
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
"-DMACE_ENABLE_OPENMP",
"-fopenmp"
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
......
......@@ -50,7 +50,7 @@ def ops_benchmark_stdout_processor(stdout, dev, abi):
if len(parts) == 5 and parts[0].startswith("BM_"):
metrics["%s.time_ms" % parts[0]] = str(float(parts[1]) / 1e6)
metrics["%s.input_mb_per_sec" % parts[0]] = parts[3]
metrics["%s.gmacc_per_sec" % parts[0]] = parts[4]
metrics["%s.gmac_per_sec" % parts[0]] = parts[4]
# platform = dev[YAMLKeyword.target_socs]
# model = dev[YAMLKeyword.device_name]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册