Commit 021dbc5b authored by 叶剑武

Merge branch 'MACs-bm' into 'master'

Add MACs statistics for model benchmark tool and related docs.

See merge request !889
@@ -9,6 +9,7 @@ stages:
  - api_test
  - python_tools_tests
  - model_tests
  - quantization_tests
  - build_android_demo
  - ops_benchmark
  - extra_tests
@@ -62,6 +63,14 @@ api_test:
    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
    - >
      if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
        GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
        DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
      fi
    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS
    - python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS

ops_benchmark:
  stage: ops_benchmark
@@ -103,7 +112,7 @@ ndk_versions_compatible_tests:
        DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
      fi
    - >
-     for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b;
      for ndk in android-ndk-r15c android-ndk-r16 android-ndk-r17b;
      do
        new_ndk_path=${prefix_path}${ndk};
        if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then
@@ -111,8 +120,12 @@ ndk_versions_compatible_tests:
          export PATH=$ANDROID_NDK_HOME:$PATH;
          echo "ndk path: $ANDROID_NDK_HOME";
          if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
-         python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
-         python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
          python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
          python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
          python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
          python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
          python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
          python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
        fi
      done
    - export ANDROID_NDK_HOME=$DEFAULT_NDK_PATH
@@ -131,9 +144,9 @@ python_tools_tests:
        DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
      fi
    - >
-     python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a,arm64 --model_graph_format=file --model_data_format=file || exit 1;
-     python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
-     python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,armhf --validate --model_graph_format=file --model_data_format=file || exit 1;
      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,armhf --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
model_tests:
  stage: model_tests
@@ -142,23 +155,39 @@ model_tests:
    - rm -rf mace-models
    - rm -rf generic-mobile-devices
    - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
    - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml
    - >
      if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
        GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
        DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
      fi
    - >
-     for CONF_FILE in mace-models/mobilenet-v1/mobilenet-v1.yml mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml;
-     do
-       python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
-       python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
-       python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
-     done
      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
    - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
    - >
      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
-     python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
-     python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
      python tools/converter.py run --config=${CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
      python tools/converter.py run --config=${CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
    - rm -rf mace-models

quantization_tests:
  stage: quantization_tests
  script:
    - pwd
    - rm -rf mace-models
    - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
    - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml
    - >
      if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
        GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
        DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
      fi
    - >
      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
      python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
    - rm -rf mace-models

build_android_demo:
...
@@ -27,6 +27,7 @@ The main documentation is organized into the following sections:
   user_guide/basic_usage
   user_guide/advanced_usage
   user_guide/benchmark
   user_guide/op_lists
   user_guide/quantization_usage
...
@@ -379,6 +379,8 @@ Useful Commands
* **benchmark and profile model**

    The detailed information is in :doc:`benchmark`.

    .. code:: sh

        # Benchmark model, get detailed statistics of each Op.
...
@@ -227,7 +227,7 @@ to run and validate your model.
* **benchmark**

-   benchmark and profile the model.
    benchmark and profile the model. The details are in :doc:`benchmark`.

    .. code:: sh
...
Benchmark usage
===============

This part describes how to use the MACE benchmark tools.

Overview
--------

As mentioned in the previous part, there are two kinds of benchmark tools:
one for operators and the other for models.

Operator Benchmark
------------------

Operator Benchmark is used to test and optimize the performance of a specific operator.

=====
Usage
=====

.. code:: bash

    python tools/bazel_adb_run.py --target="//mace/ops:ops_benchmark" --run_target=True --args="--filter=.*BM_CONV.*"
======
Output
======
.. code:: bash
Benchmark Time(ns) Iterations Input(MB/s) GMACPS
------------------------------------------------------------------------------------------------------
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_CPU 1759129 479 114.09 29.21
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_GPU 4031301 226 49.79 12.75
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_half_GPU 3996357 266 25.11 12.86
MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_uint8_t_CPU 914994 1093 54.84 56.15
===========
Explanation
===========
.. list-table::
    :header-rows: 1

    * - Options
      - Usage
    * - Benchmark
      - Benchmark unit name.
    * - Time
      - Average time of one iteration, in nanoseconds.
    * - Iterations
      - The number of iterations that were run, between 10 and 1,000,000,000; the value is chosen so that the total run time does not exceed about 1 second.
    * - Input
      - The throughput of processing the input, in MB/s.
    * - GMACPS
      - Giga MACs (multiply-accumulate operations) processed per second, in G/s. A worked example follows this table.
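To make the GMACPS column concrete, here is a quick sanity check against the first row of the sample output above. It is a sketch, not part of the tool; the MAC count of that 1x1 convolution case is computed by hand:

.. code:: python

    # MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_CPU:
    # a 1x1 convolution with 1024 input channels, 1024 output channels
    # and a 7x7 output feature map.
    macs_per_iter = 1 * 1024 * 7 * 7 * 1024   # output elements * input channels = 51,380,224
    time_ns_per_iter = 1759129                # "Time(ns)" column
    # MACs per nanosecond equals Giga MACs per second.
    print(round(macs_per_iter / time_ns_per_iter, 2))   # 29.21, matching the GMACPS column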
Model Benchmark
---------------

Model Benchmark is used to test and optimize the performance of your model.
This tool records the total running time of the model and the detailed running information of each operator in your model.

=====
Usage
=====

.. code:: bash

    python tools/converter.py benchmark --config=/path/to/your/model_deployment.yml
======
Output
======
.. code:: bash
I benchmark_model.cc:158 ---------------------------------------------------------------------
I benchmark_model.cc:158 Warm Up
I benchmark_model.cc:158 ----------------------------------------------------------------------
I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I benchmark_model.cc:158 ----------------------------------------------------------------------
I benchmark_model.cc:158 | 1 | 51.481 | 51.481 | 51.481 | 51.481 | 51.481 | 0.000 |
I benchmark_model.cc:158 ----------------------------------------------------------------------
I benchmark_model.cc:158
I benchmark_model.cc:158 ------------------------------------------------------------------------
I benchmark_model.cc:158 Run without statistics
I benchmark_model.cc:158 -------------------------------------------------------------------------
I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I benchmark_model.cc:158 -------------------------------------------------------------------------
I benchmark_model.cc:158 | 100 | 30.272 | 31.390 | 29.938 | 45.966 | 30.913 | 1850.983 |
I benchmark_model.cc:158 -------------------------------------------------------------------------
I benchmark_model.cc:158
I benchmark_model.cc:158 -----------------------------------------------------------------------
I benchmark_model.cc:158 Run with statistics
I benchmark_model.cc:158 ------------------------------------------------------------------------
I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I benchmark_model.cc:158 ------------------------------------------------------------------------
I benchmark_model.cc:158 | 100 | 32.358 | 33.327 | 32.293 | 33.607 | 33.002 | 310.435 |
I benchmark_model.cc:158 ------------------------------------------------------------------------
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 Sort by Run Order
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Op Type | Start | First | Avg(ms) | % | cdf% | GMACPS | Stride | Pad | Filter Shape | Output Shape | Dilation | name |
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Transpose | 0.000 | 0.102 | 0.100 | 0.315 | 0.315 | 0.000 | | | | [1,3,224,224] | | input |
I statistics.cc:343 | Conv2D | 0.107 | 1.541 | 1.570 | 4.943 | 5.258 | 6.904 | [2,2] | SAME | [32,3,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_0/Relu6 |
I statistics.cc:343 | DepthwiseConv2d | 1.724 | 0.936 | 0.944 | 2.972 | 8.230 | 3.827 | [1,1] | SAME | [1,32,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6 |
I statistics.cc:343 | Softmax | 32.835 | 0.039 | 0.042 | 0.131 | 99.996 | 0.000 | | | | [1,1001] | | MobilenetV1/Predictions/Softmax |
I statistics.cc:343 | Identity | 32.880 | 0.001 | 0.001 | 0.004 | 100.000 | 0.000 | | | | [1,1001] | | mace_output_node_MobilenetV1/Predictions/Reshape_1 |
I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 Sort by Computation Time
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Op Type | Start | First | Avg(ms) | % | cdf% | GMACPS | Stride | Pad | Filter Shape | Output Shape | Dilation | name |
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343 | Conv2D | 30.093 | 2.102 | 2.198 | 6.922 | 6.922 | 23.372 | [1,1] | SAME | [1024,1024,1,1] | [1,1024,7,7] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 7.823 | 2.115 | 2.164 | 6.813 | 13.735 | 23.747 | [1,1] | SAME | [128,128,1,1] | [1,128,56,56] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 15.859 | 2.119 | 2.109 | 6.642 | 20.377 | 24.358 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 23.619 | 2.087 | 2.096 | 6.599 | 26.976 | 24.517 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 26.204 | 2.081 | 2.093 | 6.590 | 33.567 | 24.549 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 21.038 | 2.036 | 2.091 | 6.585 | 40.152 | 24.569 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 18.465 | 2.034 | 2.082 | 6.554 | 46.706 | 24.684 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 2.709 | 1.984 | 2.058 | 6.482 | 53.188 | 12.480 | [1,1] | SAME | [64,32,1,1] | [1,64,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 12.220 | 1.788 | 1.901 | 5.986 | 59.174 | 27.027 | [1,1] | SAME | [256,256,1,1] | [1,256,28,28] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6 |
I statistics.cc:343 | Conv2D | 0.107 | 1.541 | 1.570 | 4.943 | 64.117 | 6.904 | [2,2] | SAME | [32,3,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_0/Relu6 |
I statistics.cc:343 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I statistics.cc:343
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343 Stat by Op Type
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343 | Op Type | Count | Avg(ms) | % | cdf% | MACs | GMACPS | Called times |
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343 | Conv2D | 15 | 24.978 | 78.693 | 78.693 | 551,355,392 | 22.074 | 15 |
I statistics.cc:343 | DepthwiseConv2d | 13 | 6.543 | 20.614 | 99.307 | 17,385,984 | 2.657 | 13 |
I statistics.cc:343 | Transpose | 1 | 0.100 | 0.315 | 99.622 | 0 | 0.000 | 1 |
I statistics.cc:343 | Pooling | 1 | 0.072 | 0.227 | 99.849 | 0 | 0.000 | 1 |
I statistics.cc:343 | Softmax | 1 | 0.041 | 0.129 | 99.978 | 0 | 0.000 | 1 |
I statistics.cc:343 | Squeeze | 1 | 0.006 | 0.019 | 99.997 | 0 | 0.000 | 1 |
I statistics.cc:343 | Identity | 1 | 0.001 | 0.003 | 100.000 | 0 | 0.000 | 1 |
I statistics.cc:343 ----------------------------------------------------------------------------------------------
I statistics.cc:343
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 Stat by MACs(Multiply-Accumulation)
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 | total | round | first(G/s) | avg(G/s) | std |
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 | 568,741,376 | 100 | 18.330 | 17.909 | 301.326 |
I statistics.cc:343 ---------------------------------------------------------
I statistics.cc:343 ------------------------------------------------------------------------
I statistics.cc:343 Summary of Ops' Stat
I statistics.cc:343 ------------------------------------------------------------------------
I statistics.cc:343 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std |
I statistics.cc:343 ------------------------------------------------------------------------
I statistics.cc:343 | 100 | 31.028 | 32.093 | 31.028 | 32.346 | 31.758 | 301.326 |
I statistics.cc:343 ------------------------------------------------------------------------
===========
Explanation
===========
The output consists of 8 sections.

1. **Warm Up**

    This section lists the time information of the warm-up runs.
    The columns are explained below.
.. list-table::
    :header-rows: 1

    * - Key
      - Explanation
    * - round
      - the number of rounds that were run.
    * - first
      - the run time of the first round, in milliseconds.
    * - curr
      - the run time of the last round, in milliseconds.
    * - min
      - the minimum run time of all rounds, in milliseconds.
    * - max
      - the maximum run time of all rounds, in milliseconds.
    * - avg
      - the average run time of all rounds, in milliseconds.
    * - std
      - the standard deviation of the run time over all rounds.
2. **Run without statistics**

    This section lists the run time information without the per-operator statistics code enabled.
    The columns are the same as in the Warm Up section.

3. **Run with statistics**

    This section lists the run time information with the per-operator statistics code enabled;
    the time may be longer than in the previous section (here, 33.002 ms vs. 30.913 ms on average).
    The columns are the same as in the Warm Up section.

4. **Sort by Run Order**

    This section lists the detailed run information of every operator in your model.
    The operators are listed in run order; every line is one operator of your model.
    The columns are explained below.
.. list-table::
    :header-rows: 1

    * - Key
      - Explanation
    * - Op Type
      - the type of the operator.
    * - Start
      - the start time of the operator, in milliseconds.
    * - First
      - the run time of the first round, in milliseconds.
    * - Avg
      - the average run time of all rounds, in milliseconds.
    * - %
      - the percentage of the total running time.
    * - cdf%
      - the cumulative percentage of the running time.
    * - GMACPS
      - the number of MACs (multiply-accumulate operations) run per second, in G/s.
    * - Stride
      - the stride parameter of the operator, if it exists.
    * - Pad
      - the pad parameter of the operator, if it exists.
    * - Filter Shape
      - the filter shape of the operator, if it exists.
    * - Output Shape
      - the output shape of the operator.
    * - Dilation
      - the dilation parameter of the operator, if it exists.
    * - Name
      - the name of the operator.
5. **Sort by Computation Time**

    This section lists the top-10 most time-consuming operators.
    The operators are sorted by computation time;
    the columns are the same as in the previous section.

6. **Stat by Op Type**

    This section aggregates the run information by operator type.
.. list-table::
    :header-rows: 1

    * - Key
      - Explanation
    * - Op Type
      - the type of the operator.
    * - Count
      - the number of operators of this type in the model.
    * - Avg
      - the average run time of this operator type, in milliseconds.
    * - %
      - the percentage of the total running time.
    * - cdf%
      - the cumulative percentage of the running time.
    * - MACs
      - the total number of MACs (multiply-accumulate operations) of this operator type.
    * - GMACPS
      - the number of MACs (multiply-accumulate operations) run per second, in G/s.
    * - Called times
      - the number of times operators of this type are called in one round.
7. **Stat by MACs**

    This section summarizes the MACs statistics of the whole model; a worked example follows this list.

.. list-table::
    :header-rows: 1

    * - Key
      - Explanation
    * - total
      - the total number of MACs of your model.
    * - round
      - the number of rounds that were run.
    * - first
      - the GMACPS of the first round, in G/s.
    * - avg
      - the average GMACPS of all rounds, in G/s.
    * - std
      - the standard deviation over all rounds.

8. **Summary of Ops' Stat**

    This section lists the run time obtained by summing every operator's run time,
    which may be shorter than the model's run time measured with statistics enabled.
    The columns are the same as in the Warm Up section.
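The worked example referenced above ties the MACs and GMACPS columns together. It is a sketch, not part of the tool: ``conv2d_macs`` mirrors the Conv2D counting rule of the ``StatMACs()`` helper added in ``mace/benchmark/statistics.cc`` (assuming an OIHW filter layout), and all numbers are taken from the sample output above.

.. code:: python

    def conv2d_macs(filter_shape, output_shape):
        # Conv2D rule: every output element costs kh * kw * input_channels MACs.
        o, i, kh, kw = filter_shape
        n, c, h, w = output_shape        # c equals o for Conv2D
        return n * c * h * w * kh * kw * i

    # "Sort by Run Order" row Conv2d_0/Relu6: filter [32,3,3,3], output [1,32,112,112].
    macs = conv2d_macs([32, 3, 3, 3], [1, 32, 112, 112])   # 10,838,016
    print(macs / (1.570e-3 * 1e9))       # Avg 1.570 ms -> ~6.90 GMACPS (table: 6.904)

    # "Stat by Op Type": 551,355,392 Conv2D MACs in 24.978 ms -> ~22.07 GMACPS (table: 22.074).
    print(551355392 / (24.978e-3 * 1e9))

    # "Stat by MACs": 568,741,376 total MACs at 17.909 G/s -> ~31.76 ms,
    # consistent with the 31.758 ms average in "Summary of Ops' Stat".
    print(568741376 / 17.909e9 * 1e3)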
@@ -15,6 +15,7 @@ cc_library(
    srcs = ["statistics.cc"],
    hdrs = ["statistics.h"],
    copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
    visibility = ["//visibility:public"],
    deps = [
        "//mace/utils",
    ],
...
@@ -48,23 +48,6 @@ std::vector<std::string> Split(const std::string &str, char delims) {
  return result;
}

-bool SplitAndParseToInts(const std::string &str,
-                         char delims,
-                         std::vector<int64_t> *result) {
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    int64_t dim = atoi(tmp.data());
-    result->push_back(dim);
-    size_t next_offset = tmp.find(delims);
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-  return true;
-}

}  // namespace str_util

void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
...
@@ -13,6 +13,7 @@
// limitations under the License.

#include <algorithm>
#include <functional>
#include <set>

#include "mace/benchmark/statistics.h"
@@ -53,7 +54,6 @@ std::string ShapeToString(
  if (output_shape.empty()) {
    return "";
  }
  std::stringstream stream;
  stream << "[";
  for (size_t i = 0; i < output_shape.size(); ++i) {
@@ -94,6 +94,46 @@ std::string VectorToString(const std::vector<T> &vec) {
}  // namespace
int64_t StatMACs(const std::string &op_type,
const std::vector<int64_t> &filter_shape,
const std::vector<int64_t> &output_shape) {
int64_t macs = 0;
if (op_type == "Conv2D" || op_type == "Deconv2D") {
macs = output_shape[0] * output_shape[1] * output_shape[2]
* output_shape[3]
* filter_shape[2] * filter_shape[3] * filter_shape[1];
} else if (op_type == "MatMul") {
macs = std::accumulate(output_shape.begin(),
output_shape.end(),
1,
std::multiplies<int64_t>())
* filter_shape.back();
} else if (op_type == "DepthwiseConv2d") {
macs = output_shape[0] * output_shape[1] * output_shape[2]
* output_shape[3] * filter_shape[0] * filter_shape[2] * filter_shape[3];
} else if (op_type == "DepthwiseDeconv2d") {
macs = output_shape[0] * output_shape[1] * output_shape[2]
* output_shape[3] * filter_shape[2] * filter_shape[3];
} else if (op_type == "FullyConnected") {
macs = output_shape[0] * std::accumulate(filter_shape.begin(),
filter_shape.end(),
1,
std::multiplies<int64_t>());
} else if (op_type == "BatchNorm") {
macs = std::accumulate(output_shape.begin(),
output_shape.end(),
1,
std::multiplies<int64_t>());
} else if (op_type == "ResizeBilinear" || op_type == "ResizeBicubic") {
macs = 3 * std::accumulate(output_shape.begin(),
output_shape.end(),
1,
std::multiplies<int64_t>());
}
return macs;
}
void OpStat::StatMetadata(const RunMetadata &meta_data) {
  if (meta_data.op_stats.empty()) {
    LOG(FATAL) << "Op metadata should not be empty";
@@ -112,6 +152,8 @@ void OpStat::StatMetadata(const RunMetadata &meta_data) {
    record->type = op_stat.type;
    record->args = op_stat.args;
    record->output_shape = op_stat.output_shape;
    record->macs =
        StatMACs(op_stat.type, op_stat.args.kernels, op_stat.output_shape[0]);
    record->order = order_idx;
    order_idx += 1;
  }
@@ -148,7 +190,7 @@ std::string OpStat::StatByMetric(const Metric metric,
  // generate string
  std::string title = "Sort by " + MetricToString(metric);
  const std::vector<std::string> header = {
-     "Node Type", "Start", "First", "Avg(ms)", "%", "cdf%",
      "Op Type", "Start", "First", "Avg(ms)", "%", "cdf%", "GMACPS",
      "Stride", "Pad", "Filter Shape", "Output Shape", "Dilation", "name"
  };
  std::vector<std::vector<std::string>> data;
@@ -169,6 +211,9 @@ std::string OpStat::StatByMetric(const Metric metric,
        FloatToString(record.rel_end.sum() * 100.f / total_time_.sum(), 3));
    tuple.push_back(
        FloatToString(accumulate_time * 100.f / total_time_.sum(), 3));
    tuple.push_back(FloatToString(
        record.macs < 1e-6 ? record.macs :
        (record.macs * 1e-3) / record.rel_end.avg(), 3));
    tuple.push_back(VectorToString<int>(record.args.strides));
    if (record.args.padding_type != -1) {
      tuple.push_back(PaddingTypeToString(record.args.padding_type));
@@ -184,40 +229,43 @@ std::string OpStat::StatByMetric(const Metric metric,
  return mace::string_util::StringFormatter::Table(title, header, data);
}
-std::string OpStat::StatByNodeType() const {
std::string OpStat::StatByOpType() const {
  if (records_.empty()) {
    return "";
  }

  const int64_t round = total_time_.round();
  int64_t total_time = 0;
  std::map<std::string, int64_t> type_time_map;
  std::map<std::string, int64_t> type_macs_map;
  std::map<std::string, int64_t> type_count_map;
  std::map<std::string, int64_t> type_called_times_map;
-  std::set<std::string> node_types_set;
  std::set<std::string> op_types_set;
  for (auto &record : records_) {
-    std::string node_type = record.second.type;
-    node_types_set.insert(node_type);
-    type_time_map[node_type] += record.second.rel_end.sum() / round;
    std::string op_type = record.second.type;
    op_types_set.insert(op_type);
    type_time_map[op_type] += record.second.rel_end.sum() / round;
    type_macs_map[op_type] += record.second.macs;
    total_time += record.second.rel_end.sum() / round;
-    type_count_map[node_type] += 1;
-    type_called_times_map[node_type] += record.second.called_times / round;
    type_count_map[op_type] += 1;
    type_called_times_map[op_type] += record.second.called_times / round;
  }
-  std::vector<std::string> node_types(node_types_set.begin(),
-                                      node_types_set.end());
-  std::sort(node_types.begin(), node_types.end(),
  std::vector<std::string> op_types(op_types_set.begin(),
                                    op_types_set.end());
  std::sort(op_types.begin(), op_types.end(),
            [&](const std::string &lhs, const std::string &rhs) {
              return type_time_map[lhs] > type_time_map[rhs];
            });

-  std::string title = "Stat by node type";
  std::string title = "Stat by Op Type";
  const std::vector<std::string> header = {
-      "Node Type", "Count", "Avg(ms)", "%", "cdf%", "Called times"
      "Op Type", "Count", "Avg(ms)", "%", "cdf%", "MACs",
      "GMACPS", "Called times"
  };
  float cdf = 0.0f;
  std::vector<std::vector<std::string>> data;
-  for (auto type : node_types) {
  for (auto type : op_types) {
    const float avg_time = type_time_map[type] / 1000.0f;
    const float percentage = type_time_map[type] * 100.0f / total_time;
    cdf += percentage;
@@ -228,12 +276,43 @@ std::string OpStat::StatByNodeType() const {
    tuple.push_back(FloatToString(avg_time, 3));
    tuple.push_back(FloatToString(percentage, 3));
    tuple.push_back(FloatToString(cdf, 3));
    tuple.push_back(IntToString(type_macs_map[type]));
    tuple.push_back(FloatToString(
        type_macs_map[type] < 1e-6 ? type_macs_map[type] :
        (type_macs_map[type] * 1e-3) / type_time_map[type], 3));
    tuple.push_back(IntToString(type_called_times_map[type]));
    data.emplace_back(tuple);
  }
  return mace::string_util::StringFormatter::Table(title, header, data);
}
std::string OpStat::StatByMACs() const {
if (records_.empty()) {
return "";
}
const int64_t round = total_time_.round();
int64_t count = 0;
for (auto &record : records_) {
count += record.second.macs;
}
std::string title = "Stat by MACs(Multiply-Accumulation)";
const std::vector<std::string> header = {
"total", "round", "first(G/s)", "avg(G/s)", "std"
};
std::vector<std::vector<std::string>> data;
std::vector<std::string> tuple;
tuple.push_back(IntToString(count));
tuple.push_back(IntToString(round));
tuple.push_back(FloatToString((count * 1e-3) / total_time_.first(), 3));
tuple.push_back(FloatToString((count * 1e-3) / total_time_.avg(), 3));
tuple.push_back(FloatToString(total_time_.std_deviation(), 3));
data.emplace_back(tuple);
return mace::string_util::StringFormatter::Table(title, header, data);
}
std::string OpStat::Summary() const {
  std::stringstream stream;
  if (!records_.empty()) {
@@ -252,9 +331,11 @@ void OpStat::PrintStat() const {
    stream << StatByMetric(Metric::RUN_ORDER, 0) << std::endl;
    // top-10 op stat by time
    stream << StatByMetric(Metric::COMPUTATION_TIME, 10) << std::endl;
-    // op stat by node type
-    stream << StatByNodeType() << std::endl;
    // op stat by op type
    stream << StatByOpType() << std::endl;
  }
  // print MACs statistics
  stream << StatByMACs();
  // Print summary
  stream << Summary();
...
@@ -19,6 +19,7 @@
#include <cmath>
#include <iomanip>
#include <limits>
#include <locale>
#include <map>
#include <sstream>
#include <string>
@@ -33,11 +34,33 @@ class RunMetadata;

namespace benchmark {

// stat the number of multiply-accumulate(MAC)
int64_t StatMACs(const std::string &op_type,
                 const std::vector<int64_t> &filter_shape,
                 const std::vector<int64_t> &output_shape);

template <typename IntType>
std::string IntToString(const IntType v) {
  std::stringstream stream;
  stream << v;
-  return stream.str();
  std::string src_str = stream.str();
  size_t size = src_str.size();
  size_t dst_size = size + ((size-1) / 3);
  if (src_str[0] == '-') {
    dst_size = size + ((size-2) / 3);
  }
  std::string result(dst_size, ',');
  size_t dst_idx = dst_size - 1;
  for (size_t src_idx = 0; src_idx < size; ++src_idx) {
    if ((src_idx % 3) != 0 || src_idx == 0 || dst_idx == 0) {
      result[dst_idx] = src_str[size - 1 - src_idx];
    } else {
      dst_idx -= 1;
      result[dst_idx] = src_str[size - 1 - src_idx];
    }
    dst_idx -= 1;
  }
  return result;
}
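The digit grouping above is what produces the readable MAC counts such as ``551,355,392`` in the "Stat by Op Type" table. For reference only, a minimal Python sketch that behaves the same way for the values shown in this commit:

.. code:: python

    def int_to_string(v: int) -> str:
        # Group digits in threes with commas, like the IntToString() helper above.
        return f"{v:,}"

    assert int_to_string(568741376) == "568,741,376"
    assert int_to_string(-1234) == "-1,234"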
template <typename FloatType>
@@ -127,7 +150,7 @@ enum Metric {
  COMPUTATION_TIME,
};

-class OpStat{
class OpStat {
 public:
  void StatMetadata(const RunMetadata &meta_data);
@@ -136,7 +159,8 @@ class OpStat{
 private:
  std::string StatByMetric(const Metric metric,
                           const int top_limit) const;
-  std::string StatByNodeType() const;
  std::string StatByOpType() const;
  std::string StatByMACs() const;
  std::string Summary() const;

 private:
@@ -145,6 +169,7 @@ class OpStat{
    std::string type;
    std::vector<std::vector<int64_t>> output_shape;
    ConvPoolArgs args;
    int64_t macs;
    int64_t order;
    TimeInfo<int64_t> start;
    TimeInfo<int64_t> rel_end;
...
@@ -403,8 +403,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
      std::string type = op->debug_def().type();
      if (type.compare("Conv2D") == 0 ||
-         type.compare("FusedConv2D") == 0 ||
          type.compare("Deconv2D") == 0 ||
          type.compare("DepthwiseConv2d") == 0 ||
          type.compare("DepthwiseDeconv2d") == 0 ||
          type.compare("Pooling") == 0) {
        strides = op->GetRepeatedArgs<int>("strides");
        padding_type = op->GetOptionalArg<int>("padding", -1);
@@ -415,6 +416,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
        } else {
          kernels = op->Input(1)->shape();
        }
      } else if (type.compare("MatMul") == 0) {
        bool transpose_a = op->GetOptionalArg<bool>("transpose_a", false);
        kernels = op->Input(0)->shape();
        if (transpose_a) {
          std::swap(kernels[kernels.size()-2], kernels[kernels.size()-1]);
        }
      } else if (type.compare("FullyConnected") == 0) {
        kernels = op->Input(1)->shape();
      }

      std::vector<std::vector<int64_t>> output_shapes;
...
@@ -28,7 +28,7 @@ namespace testing {
static std::vector<Benchmark *> *all_benchmarks = nullptr;
static int64_t bytes_processed;
-static int64_t macc_processed;
static int64_t macs_processed = 0;
static int64_t accum_time = 0;
static int64_t start_time = 0;
@@ -62,8 +62,8 @@ void Benchmark::Run(const char *pattern) {
  // Internal perf regression tools depends on the output formatting,
  // please keep in consistent when modifying
  printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)",
-        "Iterations", "Input(MB/s)", "MACC(G/s)");
-  printf("%s\n", std::string(width + 44, '-').c_str());
         "Iterations", "Input(MB/s)", "GMACPS");
  printf("%s\n", std::string(width + 45, '-').c_str());
  for (auto b : *all_benchmarks) {
    if (!std::regex_match(b->name_, match, regex)) continue;
    int iters;
@@ -71,9 +71,9 @@ void Benchmark::Run(const char *pattern) {
    b->Run(&iters, &seconds);
    float mbps = (bytes_processed * 1e-6) / seconds;
    // MACCs or other computations
-    float gmaccs = (macc_processed * 1e-9) / seconds;
    float gmacs = (macs_processed * 1e-9) / seconds;
    printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, b->name_.c_str(),
-          seconds * 1e9 / iters, iters, mbps, gmaccs);
           seconds * 1e9 / iters, iters, mbps, gmacs);
  }
}
@@ -89,7 +89,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) {
  int64_t iters = kMinIters;
  while (true) {
    bytes_processed = -1;
-    macc_processed = -1;
    macs_processed = 0;
    RestartTiming();
    (*benchmark_func_)(iters);
    StopTiming();
@@ -108,7 +108,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) {
}

void BytesProcessed(int64_t n) { bytes_processed = n; }
-void MaccProcessed(int64_t n) { macc_processed = n; }
void MacsProcessed(int64_t n) { macs_processed = n; }
void RestartTiming() {
  accum_time = 0;
  start_time = NowMicros();
...
@@ -42,7 +42,7 @@ class Benchmark {
};

void BytesProcessed(int64_t);
-void MaccProcessed(int64_t);
void MacsProcessed(int64_t);
void RestartTiming();
void StartTiming();
void StopTiming();
...
@@ -230,6 +230,7 @@ cc_test(
    linkstatic = 1,
    deps = [
        "test",
        "//mace/benchmark:statistics",
        "//mace/core:test_benchmark_main",
        "//third_party/eigen3",
    ],
...
@@ -62,7 +62,6 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
  static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
  } \
@@ -119,7 +118,6 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
  static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
  } \
@@ -179,7 +177,6 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
  static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
  } \
@@ -235,7 +232,6 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
  static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
  } \
@@ -292,7 +288,6 @@ void SigmoidBenchmark(
  static void MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
  } \
...
@@ -59,7 +59,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
  MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
  } \
...
@@ -75,7 +75,7 @@ void BatchNorm(
  static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::MacsProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
  } \
...
@@ -58,7 +58,6 @@ void BMBatchToSpace(
  MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
  } \
...
@@ -65,7 +65,6 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
  static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
  } \
...
@@ -68,7 +68,6 @@ void FilterBufferToImage(int iters,
  static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    FilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W); \
  } \
...
@@ -61,7 +61,6 @@ void ChannelShuffle(
  MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-   mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G); \
  } \
...
@@ -49,7 +49,6 @@ void ConcatHelper(int iters, int concat_dim, int dim0, int dim1) {
    net.Run();
  }
  const int64_t tot = static_cast<int64_t>(iters) * dim0 * dim1 * 2;
- mace::testing::MaccProcessed(tot);
  testing::BytesProcessed(tot * sizeof(T));
  mace::testing::StartTiming();
  while (iters--) {
@@ -104,7 +103,6 @@ void OpenCLConcatHelper(int iters,
  const int64_t tot =
      static_cast<int64_t>(iters) *
      (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
- mace::testing::MaccProcessed(tot);
  testing::BytesProcessed(tot * sizeof(T));
  mace::testing::StartTiming();
  while (iters--) {
...
@@ -14,6 +14,7 @@
#include <algorithm>

#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
@@ -154,9 +155,10 @@ void Conv2d<CPU, uint8_t>(int iters,
        (H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
    int64_t ow = \
        (W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
-   const int64_t macc = \
-       static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
-   mace::testing::MaccProcessed(macc); \
    const int64_t macs = \
        static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
            "Conv2D", {OC, C, KH, KW}, {N, oh, ow, OC}); \
    mace::testing::MacsProcessed(macs); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \
                         mace::Padding::P, OC); \
...
@@ -44,7 +44,6 @@ void CropHelper(int iters, int crop_axis, int dim1, int offset) {
    net.RunOp(D);
  }
  const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * dim1;
- mace::testing::MaccProcessed(tot);
  testing::BytesProcessed(tot * sizeof(T));
  mace::testing::StartTiming();
  while (iters--) {
@@ -96,7 +95,6 @@ void OpenCLCropHelper(int iters,
  const int64_t tot =
      static_cast<int64_t>(iters) *
      (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
- mace::testing::MaccProcessed(tot);
  testing::BytesProcessed(tot * sizeof(T));
  mace::testing::StartTiming();
  while (iters--) {
...
@@ -14,6 +14,7 @@
#include <algorithm>
+#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
@@ -90,9 +91,10 @@ static void Deconv2d(int iters,
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t oh = OH; \
int64_t ow = OW; \
-const int64_t macc = \
-static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
-mace::testing::MaccProcessed(macc); \
+const int64_t macs = \
+static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
+"Deconv2D", {OC, C, KH, KW}, {N, OH, OW, OC}); \
+mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Deconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \
mace::Padding::P, OC); \
...
@@ -62,7 +62,6 @@ void DepthToSpace(
MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
...
@@ -14,6 +14,7 @@
#include <algorithm>
+#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
@@ -115,9 +116,10 @@ void DepthwiseConv2d(int iters,
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
-const int64_t macc = \
-static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
-mace::testing::MaccProcessed(macc); \
+const int64_t macs = \
+static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
+"DepthwiseConv2d", {M, C, KH, KW}, {N, oh, ow, C}); \
+mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
...
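Same pattern for the depthwise case: the removed expression was `N * C * M * oh * ow * (KH * KW + 1)`, and the new call passes the filter as `{M, C, KH, KW}` and the output as `{N, oh, ow, C}`. A hedged sketch that reproduces that count from those two shapes:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical sketch (not the library implementation): depthwise Conv2D
// MACs = output elements * channel multiplier * (KH * KW + 1 for the bias).
int64_t DepthwiseConv2dMACs(const std::vector<int64_t> &filter,    // {M, C, KH, KW}
                            const std::vector<int64_t> &output) {  // {N, oh, ow, C}
  const int64_t multiplier = filter[0];                    // M
  const int64_t macs_per_tap = filter[2] * filter[3] + 1;  // KH * KW + bias
  const int64_t output_size =
      output[0] * output[1] * output[2] * output[3];       // N * oh * ow * C
  return output_size * multiplier * macs_per_tap;          // == N*C*M*oh*ow*(KH*KW+1)
}
```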
@@ -14,6 +14,7 @@
#include <algorithm>
+#include "mace/benchmark/statistics.h"
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
@@ -81,11 +82,12 @@ static void DepthwiseDeconv2d(int iters,
##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-const int64_t macc = \
-static_cast<int64_t>(iters) * N * H * W * KH * KW * C; \
-mace::testing::MaccProcessed(macc); \
+const int64_t macs = \
+static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
+"DepthwiseDeconv2d", {1, C, KH, KW}, {N, H, W, C}); \
+mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseDeconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, S, P); \
} \
MACE_BENCHMARK( \
MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\
...
@@ -66,7 +66,6 @@ void EltwiseBenchmark(
MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<ops::EltwiseType>(ELT_TYPE), N, H, W, C); \
...
@@ -14,6 +14,7 @@
#include <string>
+#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
@@ -104,11 +105,12 @@ void FCBenchmark<CPU, uint8_t>(
#define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
-const int64_t macc = \
-static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
+const int64_t macs = \
+static_cast<int64_t>(iters) * mace::benchmark::StatMACs( \
+"FullyConnected", {OC, H, W, C}, {N, 1, 1, OC}); \
const int64_t tot = \
static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
-mace::testing::MaccProcessed(macc); \
+mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
} \
...
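For the fully-connected macro, the removed expression was `N * C * H * W * OC + OC`; the new call passes the weight as `{OC, H, W, C}` and the output as `{N, 1, 1, OC}`. A sketch under the assumption that the bias is now counted once per output element (which is why the value can differ by a small constant from the old formula):

```cpp
#include <cstdint>
#include <vector>

// Hypothetical sketch: FC MACs = batch * out_channels * (input_size + bias).
// Assumes weight {OC, H, W, C} and output {N, 1, 1, OC} as passed above.
int64_t FullyConnectedMACs(const std::vector<int64_t> &weight,
                           const std::vector<int64_t> &output) {
  const int64_t input_size = weight[1] * weight[2] * weight[3];  // H * W * C
  const int64_t batch = output[0];                               // N
  const int64_t out_channels = output[3];                        // OC
  return batch * out_channels * (input_size + 1);
}
```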
@@ -66,7 +66,6 @@ void GatherBenchmark(int iters,
MACE_BM_GATHER##_##N##_##IND##_##VOC##_##EMBED##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * IND * EMBED; \
-mace::testing::MaccProcessed(0); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
GatherBenchmark<DEVICE, TYPE>(iters, N, IND, VOC, EMBED); \
} \
...
@@ -59,7 +59,6 @@ static void LocalResponseNorm(
MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
LocalResponseNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
...
@@ -79,11 +79,11 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) {
static void \
MACE_BM_LSTMCELL_##N##_##INPUT_SIZE##_##HIDDEN_UNITS##_##TYPE##_##DEVICE(\
int iters) { \
-const int64_t macc = \
+const int64_t macs = \
static_cast<int64_t>( \
iters) * N * (INPUT_SIZE + HIDDEN_UNITS) * 4 * HIDDEN_UNITS; \
const int64_t tot = static_cast<int64_t>(iters) * N * INPUT_SIZE; \
-mace::testing::MaccProcessed(macc); \
+mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
LSTMCell<DEVICE, TYPE>(iters, N, INPUT_SIZE, HIDDEN_UNITS); \
} \
...
@@ -19,6 +19,7 @@
#include <vector>
#include "public/gemmlowp.h"
+#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/gemm.h"
#include "mace/ops/sgemm.h"
@@ -223,9 +224,10 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
#define MACE_BM_MATMUL_FUNC(M, K, N, FUNC, TYPE) \
static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \
-const int64_t macc = static_cast<int64_t>(iters) * M * K * N; \
+const int64_t macs = static_cast<int64_t>(iters) * \
+mace::benchmark::StatMACs("MatMul", {K}, {M, N}); \
const int64_t tot = static_cast<int64_t>(iters) * (M + N) * K; \
-mace::testing::MaccProcessed(macc); \
+mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot * sizeof(TYPE)); \
MatmulBenchmark_##FUNC(iters, M, K, N); \
} \
@@ -377,9 +379,10 @@ void MatMulTransposeBenchmark(
#define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
-const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
+const int64_t macs = static_cast<int64_t>(iters) * \
+mace::benchmark::StatMACs("MatMul", {C}, {N, H, W}); \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
-mace::testing::MaccProcessed(macc); \
+mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
@@ -392,9 +395,10 @@ void MatMulTransposeBenchmark(
#define MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE) \
static void MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
int iters) { \
-const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
+const int64_t macs = static_cast<int64_t>(iters) * \
+mace::benchmark::StatMACs("MatMul", {C}, {N, H, W}); \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
-mace::testing::MaccProcessed(macc); \
+mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulTransposeBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
...
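The three MatMul macros pass only the reduction depth as the "filter" shape (`{K}` or `{C}`) and the remaining dimensions as the output shape, where the old code used `M * K * N` and `N * C * H * W`. A sketch consistent with both removed expressions:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical sketch: MatMul MACs = reduction depth * number of output
// elements, e.g. StatMACs("MatMul", {K}, {M, N}) -> M * K * N.
int64_t MatMulMACs(const std::vector<int64_t> &filter,    // {K} or {C}
                   const std::vector<int64_t> &output) {  // {M, N} or {N, H, W}
  int64_t macs = filter.back();        // depth of the reduction
  for (int64_t d : output) macs *= d;  // output elements
  return macs;
}
```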
@@ -94,7 +94,6 @@ void MemoryAccessBenchmark_NHCW(
static void MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot * sizeof(float)); \
MemoryAccessBenchmark_##ORDER(iters, N, H, W, C); \
} \
...
@@ -57,7 +57,6 @@ void Pad(int iters, int batch, int height,
static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD); \
} \
...
@@ -81,7 +81,6 @@ void Pooling(int iters,
##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Pooling<DEVICE, TYPE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
PoolingType::PO); \
...
@@ -82,7 +82,6 @@ void Dequantize(int iters, int count) {
MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Quantize<DEVICE, TYPE>(iters, N); \
} \
@@ -97,7 +96,6 @@ void Dequantize(int iters, int count) {
MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Dequantize<DEVICE, TYPE>(iters, N); \
} \
...
@@ -60,7 +60,6 @@ void Reduce(int iters, int batch, int channels,
MACE_BM_REDUCE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Reduce<DEVICE, TYPE>(iters, N, C, H, W); \
} \
...
@@ -13,6 +13,8 @@
// limitations under the License.
#include <string>
+#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
@@ -69,9 +71,10 @@ void ResizeBicubicBenchmark(int iters,
MACE_BM_RESIZE_BICUBIC_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE( \
int iters) { \
-const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
+const int64_t macs = static_cast<int64_t>(iters) * \
+mace::benchmark::StatMACs("ResizeBicubic", {}, {N, H1, W1, C}); \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
-mace::testing::MaccProcessed(macc); \
+mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBicubicBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \
...
@@ -13,6 +13,8 @@
// limitations under the License.
#include <string>
+#include "mace/benchmark/statistics.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
@@ -75,9 +77,10 @@ void ResizeBilinearBenchmark(int iters,
MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
##DEVICE( \
int iters) { \
-const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
+const int64_t macs = static_cast<int64_t>(iters) * \
+mace::benchmark::StatMACs("ResizeBilinear", {}, {N, H1, W1, C}); \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
-mace::testing::MaccProcessed(macc); \
+mace::testing::MacsProcessed(macs); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \
...
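Both resize benchmarks previously charged three MACs per output element (`N * C * H1 * W1 * 3`) and now pass an empty filter shape plus the output shape `{N, H1, W1, C}` to `StatMACs`. A sketch matching that accounting:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical sketch: resize (bilinear/bicubic) MACs counted as a fixed
// 3 multiply-adds per output element, matching the removed N*C*H1*W1*3.
int64_t ResizeMACs(const std::vector<int64_t> &output) {  // {N, H1, W1, C}
  int64_t output_size = 1;
  for (int64_t d : output) output_size *= d;
  return 3 * output_size;
}
```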
@@ -51,10 +51,9 @@ void Reverse(int iters, int batch, int channels, int height, int width) {
#define MACE_BM_REVERSE_MACRO(N, C, H, W, TYPE, DEVICE) \
static void MACE_BM_REVERSE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
-const int64_t macc = \
+const int64_t macs = \
static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Reverse<DEVICE, TYPE>(iters, N, C, H, W); \
} \
...
@@ -98,7 +98,6 @@ void SoftmaxBenchmark<CPU, uint8_t>(
static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
...
@@ -64,7 +64,6 @@ void BMSpaceToBatch(
MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \
} \
...
@@ -62,7 +62,6 @@ void SpaceToDepth(
MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
...
@@ -65,7 +65,7 @@ void BMSplitHelper(int iters,
MACE_BM_SPLIT_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
-mace::testing::MaccProcessed(tot); \
+mace::testing::MacsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSplitHelper<DEVICE, TYPE>(iters, {N, H, W, C}, NO); \
} \
...
@@ -63,7 +63,6 @@ void SqrDiffMean(int iters, int batch, int channels,
MACE_BM_SQRDIFF_MEAN_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SqrDiffMean<DEVICE, TYPE>(iters, N, C, H, W); \
} \
...
@@ -58,7 +58,6 @@ void TransposeBenchmark(int iters,
static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {H, W}, {1, 0}); \
} \
@@ -72,7 +71,6 @@ void TransposeBenchmark(int iters,
MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3}); \
} \
...
@@ -7,10 +7,10 @@ licenses(["notice"]) # Apache 2.0
load(
"//mace:mace.bzl",
"if_android",
-"if_hexagon_enabled",
-"if_not_hexagon_enabled",
-"if_openmp_enabled",
"if_neon_enabled",
+"if_openmp_enabled",
+"if_android_armv7",
+"if_hexagon_enabled",
"if_opencl_enabled",
"if_quantize_enabled",
)
@@ -32,16 +32,19 @@ cc_test(
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
-"-fopenmp",
-"-DMACE_ENABLE_OPENMP",
+"-fopenmp"
+]) + if_neon_enabled([
+"-DMACE_ENABLE_NEON",
+]) + if_android_armv7([
+"-mfpu=neon",
+]) + if_android_armv7([
+"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
-]) + if_neon_enabled([
-"-DMACE_ENABLE_NEON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
@@ -62,16 +65,19 @@ cc_test(
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
-"-fopenmp",
-"-DMACE_ENABLE_OPENMP",
+"-fopenmp"
+]) + if_neon_enabled([
+"-DMACE_ENABLE_NEON",
+]) + if_android_armv7([
+"-mfpu=neon",
+]) + if_android_armv7([
+"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
-]) + if_neon_enabled([
-"-DMACE_ENABLE_NEON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
@@ -92,16 +98,19 @@ cc_test(
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
-"-fopenmp",
-"-DMACE_ENABLE_OPENMP",
+"-fopenmp"
+]) + if_neon_enabled([
+"-DMACE_ENABLE_NEON",
+]) + if_android_armv7([
+"-mfpu=neon",
+]) + if_android_armv7([
+"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
-]) + if_neon_enabled([
-"-DMACE_ENABLE_NEON",
]),
linkopts = ["-fopenmp"],
linkstatic = 1,
...
@@ -50,7 +50,7 @@ def ops_benchmark_stdout_processor(stdout, dev, abi):
if len(parts) == 5 and parts[0].startswith("BM_"):
metrics["%s.time_ms" % parts[0]] = str(float(parts[1]) / 1e6)
metrics["%s.input_mb_per_sec" % parts[0]] = parts[3]
-metrics["%s.gmacc_per_sec" % parts[0]] = parts[4]
+metrics["%s.gmac_per_sec" % parts[0]] = parts[4]
# platform = dev[YAMLKeyword.target_socs]
# model = dev[YAMLKeyword.device_name]
...