diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5e3a22c5bd6ac679c3d38398dbe190d32cd81f59..f454edf8aa0f3c850eaf477d55d499abea69cc25 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,6 +9,7 @@ stages: - api_test - python_tools_tests - model_tests + - quantization_tests - build_android_demo - ops_benchmark - extra_tests @@ -62,6 +63,14 @@ api_test: - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS - python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS + - > + if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then + GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git + DEVICE_CONF_FILE=generic-mobile-devices/devices.yml + fi + - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS + - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS + - python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS ops_benchmark: stage: ops_benchmark @@ -103,7 +112,7 @@ ndk_versions_compatible_tests: 
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - > - for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b; + for ndk in android-ndk-r15c android-ndk-r16 android-ndk-r17b; do new_ndk_path=${prefix_path}${ndk}; if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then @@ -111,8 +120,12 @@ ndk_versions_compatible_tests: export PATH=$ANDROID_NDK_HOME:$PATH; echo "ndk path: $ANDROID_NDK_HOME"; if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*"; - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor 
--target_abis=arm64-v8a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*"; fi done - export ANDROID_NDK_HOME=$DEFAULT_NDK_PATH @@ -131,9 +144,9 @@ python_tools_tests: DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - > - python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a,arm64 --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,armhf --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,armhf --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; model_tests: stage: model_tests @@ -142,23 +155,39 @@ model_tests: - rm -rf mace-models - rm -rf generic-mobile-devices - 
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git + - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml - > if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - > - for CONF_FILE in mace-models/mobilenet-v1/mobilenet-v1.yml mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml; - do - python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1; - python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; - done + python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml - > python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || 
exit 1; - python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + - rm -rf mace-models + +quantization_tests: + stage: quantization_tests + script: + - pwd + - rm -rf mace-models + - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git + - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml + - > + if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then + GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git + DEVICE_CONF_FILE=generic-mobile-devices/devices.yml + fi + - > + python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; - rm -rf mace-models build_android_demo: diff --git a/docs/index.rst b/docs/index.rst index f839a13f7cf8d04c39d63280306ee3fb8dff513b..7545f2aa8c3227a88fc1b1e4fdc1ea194186c474 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -27,6 +27,7 @@ The main documentation is organized into the following sections: user_guide/basic_usage user_guide/advanced_usage + 
user_guide/benchmark user_guide/op_lists user_guide/quantization_usage diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 93ebb4f8d1c9f66d9aa600c1b063f2a6b8d488da..8395c45b783588f047e51a9a0bedcae0a5a7bd11 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -379,6 +379,8 @@ Useful Commands * **benchmark and profile model** +The detailed information is in :doc:`benchmark`. + .. code:: sh # Benchmark model, get detailed statistics of each Op. diff --git a/docs/user_guide/basic_usage.rst b/docs/user_guide/basic_usage.rst index d4d404baf8d652f169fe029be2a4966880351dd6..6d59a68eced45173ecc8c5e448f20661d34e6ecf 100644 --- a/docs/user_guide/basic_usage.rst +++ b/docs/user_guide/basic_usage.rst @@ -227,7 +227,7 @@ to run and validate your model. * **benchmark** - benchmark and profile the model. + benchmark and profile the model. The details are in :doc:`benchmark`. .. code:: sh diff --git a/docs/user_guide/benchmark.rst b/docs/user_guide/benchmark.rst new file mode 100644 index 0000000000000000000000000000000000000000..a190a7cc4443a8c8e9147bcd91dd8f765c43268d --- /dev/null +++ b/docs/user_guide/benchmark.rst @@ -0,0 +1,293 @@ +Benchmark usage +=============== + +This part describes how to use the MACE benchmark tools. + +Overview +-------- + +As mentioned in the previous part, there are two kinds of benchmark tools, +one for operators and the other for models. + +Operator Benchmark +------------------ + +Operator Benchmark is used to test and optimize the performance of a specific operator. + +===== +Usage +===== + + .. code:: bash + + python tools/bazel_adb_run.py --target="//mace/ops:ops_benchmark" --run_target=True --args="--filter=.*BM_CONV.*" + +====== +Output +====== + + .. 
code:: bash + + Benchmark Time(ns) Iterations Input(MB/s) GMACPS + ------------------------------------------------------------------------------------------------------ + MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_CPU 1759129 479 114.09 29.21 + MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_GPU 4031301 226 49.79 12.75 + MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_half_GPU 3996357 266 25.11 12.86 + MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_uint8_t_CPU 914994 1093 54.84 56.15 + + +=========== +Explanation +=========== + +.. list-table:: + :header-rows: 1 + + * - Options + - Usage + * - Benchmark + - Benchmark unit name. + * - Time + - Time of one round. + * - Iterations + - the number of iterations to run, which is between 10 and 1000,000,000. the value is calculated based on the strategy total run time does not exceed 1s. + * - Input + - The bandwidth of dealing with input. the unit is MB/s. + * - GMACPS + - The speed of running MACs(multiply-accumulation). the unit is G/s. + +Model Benchmark +--------------- + +Model Benchmark is used for test and optimize the performance of your model. +This tool could record the running time of the model and the detailed running information of each operator of your model. + +===== +Usage +===== + + .. code:: bash + + python tools/converter.py benchmark --config=/path/to/your/model_deployment.yml + +====== +Output +====== + + .. 
code:: bash + + I benchmark_model.cc:158 --------------------------------------------------------------------- + I benchmark_model.cc:158 Warm Up + I benchmark_model.cc:158 ---------------------------------------------------------------------- + I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std | + I benchmark_model.cc:158 ---------------------------------------------------------------------- + I benchmark_model.cc:158 | 1 | 51.481 | 51.481 | 51.481 | 51.481 | 51.481 | 0.000 | + I benchmark_model.cc:158 ---------------------------------------------------------------------- + I benchmark_model.cc:158 + I benchmark_model.cc:158 ------------------------------------------------------------------------ + I benchmark_model.cc:158 Run without statistics + I benchmark_model.cc:158 ------------------------------------------------------------------------- + I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std | + I benchmark_model.cc:158 ------------------------------------------------------------------------- + I benchmark_model.cc:158 | 100 | 30.272 | 31.390 | 29.938 | 45.966 | 30.913 | 1850.983 | + I benchmark_model.cc:158 ------------------------------------------------------------------------- + I benchmark_model.cc:158 + I benchmark_model.cc:158 ----------------------------------------------------------------------- + I benchmark_model.cc:158 Run with statistics + I benchmark_model.cc:158 ------------------------------------------------------------------------ + I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std | + I benchmark_model.cc:158 ------------------------------------------------------------------------ + I benchmark_model.cc:158 | 100 | 32.358 | 33.327 | 32.293 | 33.607 | 33.002 | 310.435 | + I benchmark_model.cc:158 ------------------------------------------------------------------------ + I statistics.cc:343 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 Sort by Run Order + I statistics.cc:343 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 | Op Type | Start | First | Avg(ms) | % | cdf% | GMACPS | Stride | Pad | Filter Shape | Output Shape | Dilation | name | + I statistics.cc:343 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 | Transpose | 0.000 | 0.102 | 0.100 | 0.315 | 0.315 | 0.000 | | | | [1,3,224,224] | | input | + I statistics.cc:343 | Conv2D | 0.107 | 1.541 | 1.570 | 4.943 | 5.258 | 6.904 | [2,2] | SAME | [32,3,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_0/Relu6 | + I statistics.cc:343 | DepthwiseConv2d | 1.724 | 0.936 | 0.944 | 2.972 | 8.230 | 3.827 | [1,1] | SAME | [1,32,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6 | + I statistics.cc:343 | Softmax | 32.835 | 0.039 | 0.042 | 0.131 | 99.996 | 0.000 | | | | [1,1001] | | MobilenetV1/Predictions/Softmax | + I statistics.cc:343 | Identity | 32.880 | 0.001 | 0.001 | 0.004 | 100.000 | 0.000 | | | | [1,1001] | | mace_output_node_MobilenetV1/Predictions/Reshape_1 | + I statistics.cc:343 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 + I statistics.cc:343 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 Sort by Computation Time + I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 | Op Type | Start | First | Avg(ms) | % | cdf% | GMACPS | Stride | Pad | Filter Shape | Output Shape | Dilation | name | + I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 | Conv2D | 30.093 | 2.102 | 2.198 | 6.922 | 6.922 | 23.372 | [1,1] | SAME | [1024,1024,1,1] | [1,1024,7,7] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 7.823 | 2.115 | 2.164 | 6.813 | 13.735 | 23.747 | [1,1] | SAME | [128,128,1,1] | [1,128,56,56] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 15.859 | 2.119 | 2.109 | 6.642 | 20.377 | 24.358 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 23.619 | 2.087 | 2.096 | 6.599 | 26.976 | 24.517 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 26.204 | 2.081 | 2.093 | 6.590 | 33.567 | 24.549 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 21.038 | 2.036 | 2.091 | 6.585 | 40.152 | 24.569 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 18.465 | 2.034 | 2.082 | 6.554 | 46.706 | 24.684 | 
[1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 2.709 | 1.984 | 2.058 | 6.482 | 53.188 | 12.480 | [1,1] | SAME | [64,32,1,1] | [1,64,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 12.220 | 1.788 | 1.901 | 5.986 | 59.174 | 27.027 | [1,1] | SAME | [256,256,1,1] | [1,256,28,28] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 0.107 | 1.541 | 1.570 | 4.943 | 64.117 | 6.904 | [2,2] | SAME | [32,3,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_0/Relu6 | + I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 + I statistics.cc:343 ---------------------------------------------------------------------------------------------- + I statistics.cc:343 Stat by Op Type + I statistics.cc:343 ---------------------------------------------------------------------------------------------- + I statistics.cc:343 | Op Type | Count | Avg(ms) | % | cdf% | MACs | GMACPS | Called times | + I statistics.cc:343 ---------------------------------------------------------------------------------------------- + I statistics.cc:343 | Conv2D | 15 | 24.978 | 78.693 | 78.693 | 551,355,392 | 22.074 | 15 | + I statistics.cc:343 | DepthwiseConv2d | 13 | 6.543 | 20.614 | 99.307 | 17,385,984 | 2.657 | 13 | + I statistics.cc:343 | Transpose | 1 | 0.100 | 0.315 | 99.622 | 0 | 0.000 | 1 | + I statistics.cc:343 | Pooling | 1 | 0.072 | 0.227 | 99.849 | 0 | 0.000 | 1 | + I statistics.cc:343 | Softmax | 1 | 0.041 | 0.129 | 99.978 | 0 | 0.000 | 1 | + I statistics.cc:343 | Squeeze | 1 | 0.006 | 0.019 | 99.997 | 0 | 0.000 | 1 | + I statistics.cc:343 | Identity | 1 | 0.001 | 0.003 | 100.000 | 0 | 0.000 | 1 | + I statistics.cc:343 
---------------------------------------------------------------------------------------------- + I statistics.cc:343 + I statistics.cc:343 --------------------------------------------------------- + I statistics.cc:343 Stat by MACs(Multiply-Accumulation) + I statistics.cc:343 --------------------------------------------------------- + I statistics.cc:343 | total | round | first(G/s) | avg(G/s) | std | + I statistics.cc:343 --------------------------------------------------------- + I statistics.cc:343 | 568,741,376 | 100 | 18.330 | 17.909 | 301.326 | + I statistics.cc:343 --------------------------------------------------------- + I statistics.cc:343 ------------------------------------------------------------------------ + I statistics.cc:343 Summary of Ops' Stat + I statistics.cc:343 ------------------------------------------------------------------------ + I statistics.cc:343 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std | + I statistics.cc:343 ------------------------------------------------------------------------ + I statistics.cc:343 | 100 | 31.028 | 32.093 | 31.028 | 32.346 | 31.758 | 301.326 | + I statistics.cc:343 ------------------------------------------------------------------------ + + +=========== +Explanation +=========== + +There are 8 sections of the output information. + +1. **Warm Up** + +This section lists the time information of warm-up run. +The detailed explanation is list as below. + +.. list-table:: + :header-rows: 1 + + * - Key + - Explanation + * - round + - the number of round has been run. + * - first + - the run time of first round. unit is millisecond. + * - curr + - the run time of last round. unit is millisecond. + * - min + - the minimal run time of all rounds. unit is millisecond. + * - max + - the maximal run time of all rounds. unit is millisecond. + * - avg + - the average run time of all rounds. unit is millisecond. + * - std + - the standard deviation of all rounds. + +2. 
**Run without statistics** + +This section lists the run time information without statistics code. +The detailed explanation is the same as in the Warm Up section. + +3. **Run with statistics** + +This section lists the run time information with statistics code; +the time may be longer than in the second section. +The detailed explanation is the same as in the Warm Up section. + +4. **Sort by Run Order** + +This section lists the detailed run information of every operator in your model. +The operators are listed in run order; every line is an operator of your model. +The detailed explanation is listed below. + +.. list-table:: + :header-rows: 1 + + * - Key + - Explanation + * - Op Type + - the type of operator. + * - Start + - the start time of the operator. unit is millisecond. + * - First + - the run time of first round. unit is millisecond. + * - Avg + - the average run time of all rounds. unit is millisecond. + * - % + - the percentage of total running time. + * - cdf% + - the cumulative percentage of running time. + * - GMACPS + - The number of MACs (multiply-accumulate operations) run per second. The unit is G/s. + * - Stride + - the stride parameter of the operator if it exists. + * - Pad + - the pad parameter of the operator if it exists. + * - Filter Shape + - the filter shape of the operator if it exists. + * - Output Shape + - the output shape of the operator. + * - Dilation + - the dilation parameter of the operator if it exists. + * - Name + - the name of the operator. + +5. **Sort by Computation Time** + +This section lists the top-10 most time-consuming operators. +The operators are listed based on the computation time. +The detailed explanation is the same as in the previous section. + +6. **Stat by Op Type** + +This section summarizes the run information of operators grouped by operator type. + +.. list-table:: + :header-rows: 1 + + * - Op Type + - the type of operator. + * - Count + - the number of operators with the type. 
+ * - Avg + - the average run time of the operator. unit is millisecond. + * - % + - the percentage of total running time. + * - cdf% + - the cumulative percentage of running time. + * - MACs + - The number of MACs (multiply-accumulate operations). + * - GMACPS + - The number of MACs (multiply-accumulate operations) run per second. The unit is G/s. + * - Called times + - the number of times operators of this type were called in all rounds. + +7. **Stat by MACs** + +This section summarizes the MACs information of your model. + +.. list-table:: + :header-rows: 1 + + * - total + - the number of MACs of your model. + * - round + - the number of rounds that have been run. + * - First + - the GMACPS of first round. unit is G/s. + * - Avg + - the average GMACPS of all rounds. unit is G/s. + * - std + - the standard deviation of all rounds. + +8. **Summary of Ops' Stat** + +This section lists the run time information, which is the summation of every operator's run time +and may be shorter than the model's run time with statistics. +The detailed explanation is the same as in the Warm Up section. 
diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD index b086ad24479e74a31e429fc53454ebc38021bdc3..3fe9a006b942f79274bdaf72f6e75845cde59d0c 100644 --- a/mace/benchmark/BUILD +++ b/mace/benchmark/BUILD @@ -15,6 +15,7 @@ cc_library( srcs = ["statistics.cc"], hdrs = ["statistics.h"], copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"], + visibility = ["//visibility:public"], deps = [ "//mace/utils", ], diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index 7f0afe2405c2bd6f07545a34f7b5deaa17ebd145..bcb9ae752602e08bbf9cec48ef7934ccde1dcef0 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -48,23 +48,6 @@ std::vector Split(const std::string &str, char delims) { return result; } -bool SplitAndParseToInts(const std::string &str, - char delims, - std::vector *result) { - std::string tmp = str; - while (!tmp.empty()) { - int64_t dim = atoi(tmp.data()); - result->push_back(dim); - size_t next_offset = tmp.find(delims); - if (next_offset == std::string::npos) { - break; - } else { - tmp = tmp.substr(next_offset + 1); - } - } - return true; -} - } // namespace str_util void ParseShape(const std::string &str, std::vector *shape) { diff --git a/mace/benchmark/statistics.cc b/mace/benchmark/statistics.cc index 0f05798c9881660ee59e19600bd045b315472d37..7329c247854679f3dbc12620e75f0b7c02503a54 100644 --- a/mace/benchmark/statistics.cc +++ b/mace/benchmark/statistics.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include #include "mace/benchmark/statistics.h" @@ -53,7 +54,6 @@ std::string ShapeToString( if (output_shape.empty()) { return ""; } - std::stringstream stream; stream << "["; for (size_t i = 0; i < output_shape.size(); ++i) { @@ -94,6 +94,46 @@ std::string VectorToString(const std::vector &vec) { } // namespace + +int64_t StatMACs(const std::string &op_type, + const std::vector &filter_shape, + const std::vector &output_shape) { + int64_t macs = 0; + if (op_type == "Conv2D" || op_type == "Deconv2D") { + macs = output_shape[0] * output_shape[1] * output_shape[2] + * output_shape[3] + * filter_shape[2] * filter_shape[3] * filter_shape[1]; + } else if (op_type == "MatMul") { + macs = std::accumulate(output_shape.begin(), + output_shape.end(), + 1, + std::multiplies()) + * filter_shape.back(); + } else if (op_type == "DepthwiseConv2d") { + macs = output_shape[0] * output_shape[1] * output_shape[2] + * output_shape[3] * filter_shape[0] * filter_shape[2] * filter_shape[3]; + } else if (op_type == "DepthwiseDeconv2d") { + macs = output_shape[0] * output_shape[1] * output_shape[2] + * output_shape[3] * filter_shape[2] * filter_shape[3]; + } else if (op_type == "FullyConnected") { + macs = output_shape[0] * std::accumulate(filter_shape.begin(), + filter_shape.end(), + 1, + std::multiplies()); + } else if (op_type == "BatchNorm") { + macs = std::accumulate(output_shape.begin(), + output_shape.end(), + 1, + std::multiplies()); + } else if (op_type == "ResizeBilinear" || op_type == "ResizeBicubic") { + macs = 3 * std::accumulate(output_shape.begin(), + output_shape.end(), + 1, + std::multiplies()); + } + return macs; +} + void OpStat::StatMetadata(const RunMetadata &meta_data) { if (meta_data.op_stats.empty()) { LOG(FATAL) << "Op metadata should not be empty"; @@ -112,6 +152,8 @@ void OpStat::StatMetadata(const RunMetadata &meta_data) { record->type = op_stat.type; record->args = op_stat.args; record->output_shape = op_stat.output_shape; + record->macs = + 
StatMACs(op_stat.type, op_stat.args.kernels, op_stat.output_shape[0]); record->order = order_idx; order_idx += 1; } @@ -148,7 +190,7 @@ std::string OpStat::StatByMetric(const Metric metric, // generate string std::string title = "Sort by " + MetricToString(metric); const std::vector header = { - "Node Type", "Start", "First", "Avg(ms)", "%", "cdf%", + "Op Type", "Start", "First", "Avg(ms)", "%", "cdf%", "GMACPS", "Stride", "Pad", "Filter Shape", "Output Shape", "Dilation", "name" }; std::vector> data; @@ -169,6 +211,9 @@ std::string OpStat::StatByMetric(const Metric metric, FloatToString(record.rel_end.sum() * 100.f / total_time_.sum(), 3)); tuple.push_back( FloatToString(accumulate_time * 100.f / total_time_.sum(), 3)); + tuple.push_back(FloatToString( + record.macs < 1e-6 ? record.macs : + (record.macs * 1e-3) / record.rel_end.avg(), 3)); tuple.push_back(VectorToString(record.args.strides)); if (record.args.padding_type != -1) { tuple.push_back(PaddingTypeToString(record.args.padding_type)); @@ -184,40 +229,43 @@ std::string OpStat::StatByMetric(const Metric metric, return mace::string_util::StringFormatter::Table(title, header, data); } -std::string OpStat::StatByNodeType() const { +std::string OpStat::StatByOpType() const { if (records_.empty()) { return ""; } const int64_t round = total_time_.round(); int64_t total_time = 0; std::map type_time_map; + std::map type_macs_map; std::map type_count_map; std::map type_called_times_map; - std::set node_types_set; + std::set op_types_set; for (auto &record : records_) { - std::string node_type = record.second.type; - node_types_set.insert(node_type); + std::string op_type = record.second.type; + op_types_set.insert(op_type); - type_time_map[node_type] += record.second.rel_end.sum() / round; + type_time_map[op_type] += record.second.rel_end.sum() / round; + type_macs_map[op_type] += record.second.macs; total_time += record.second.rel_end.sum() / round; - type_count_map[node_type] += 1; - 
type_called_times_map[node_type] += record.second.called_times / round; + type_count_map[op_type] += 1; + type_called_times_map[op_type] += record.second.called_times / round; } - std::vector node_types(node_types_set.begin(), - node_types_set.end()); - std::sort(node_types.begin(), node_types.end(), + std::vector op_types(op_types_set.begin(), + op_types_set.end()); + std::sort(op_types.begin(), op_types.end(), [&](const std::string &lhs, const std::string &rhs) { return type_time_map[lhs] > type_time_map[rhs]; }); - std::string title = "Stat by node type"; + std::string title = "Stat by Op Type"; const std::vector header = { - "Node Type", "Count", "Avg(ms)", "%", "cdf%", "Called times" + "Op Type", "Count", "Avg(ms)", "%", "cdf%", "MACs", + "GMACPS", "Called times" }; float cdf = 0.0f; std::vector> data; - for (auto type : node_types) { + for (auto type : op_types) { const float avg_time = type_time_map[type] / 1000.0f; const float percentage = type_time_map[type] * 100.0f / total_time; cdf += percentage; @@ -228,12 +276,43 @@ std::string OpStat::StatByNodeType() const { tuple.push_back(FloatToString(avg_time, 3)); tuple.push_back(FloatToString(percentage, 3)); tuple.push_back(FloatToString(cdf, 3)); + tuple.push_back(IntToString(type_macs_map[type])); + tuple.push_back(FloatToString( + type_macs_map[type] < 1e-6 ? 
type_macs_map[type] : + (type_macs_map[type] * 1e-3) / type_time_map[type], 3)); tuple.push_back(IntToString(type_called_times_map[type])); data.emplace_back(tuple); } return mace::string_util::StringFormatter::Table(title, header, data); } + +std::string OpStat::StatByMACs() const { + if (records_.empty()) { + return ""; + } + const int64_t round = total_time_.round(); + int64_t count = 0; + for (auto &record : records_) { + count += record.second.macs; + } + + std::string title = "Stat by MACs(Multiply-Accumulation)"; + const std::vector header = { + "total", "round", "first(G/s)", "avg(G/s)", "std" + }; + + std::vector> data; + std::vector tuple; + tuple.push_back(IntToString(count)); + tuple.push_back(IntToString(round)); + tuple.push_back(FloatToString((count * 1e-3) / total_time_.first(), 3)); + tuple.push_back(FloatToString((count * 1e-3) / total_time_.avg(), 3)); + tuple.push_back(FloatToString(total_time_.std_deviation(), 3)); + data.emplace_back(tuple); + return mace::string_util::StringFormatter::Table(title, header, data); +} + std::string OpStat::Summary() const { std::stringstream stream; if (!records_.empty()) { @@ -252,9 +331,11 @@ void OpStat::PrintStat() const { stream << StatByMetric(Metric::RUN_ORDER, 0) << std::endl; // top-10 op stat by time stream << StatByMetric(Metric::COMPUTATION_TIME, 10) << std::endl; - // op stat by node type - stream << StatByNodeType() << std::endl; + // op stat by op type + stream << StatByOpType() << std::endl; } + // print MACs statistics + stream << StatByMACs(); // Print summary stream << Summary(); diff --git a/mace/benchmark/statistics.h b/mace/benchmark/statistics.h index 52f963e5331ae095d88f79bbbb8db756fa02d954..f0cf2be69dbba660a0326665a6a3b6b282eef4b7 100644 --- a/mace/benchmark/statistics.h +++ b/mace/benchmark/statistics.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -33,11 +34,33 @@ class RunMetadata; namespace benchmark { +// stat the number of 
multiply-accumulate(MAC) +int64_t StatMACs(const std::string &op_type, + const std::vector &filter_shape, + const std::vector &output_shape); + template std::string IntToString(const IntType v) { std::stringstream stream; stream << v; - return stream.str(); + std::string src_str = stream.str(); + size_t size = src_str.size(); + size_t dst_size = size + ((size-1) / 3); + if (src_str[0] == '-') { + dst_size = size + ((size-2) / 3); + } + std::string result(dst_size, ','); + size_t dst_idx = dst_size - 1; + for (size_t src_idx = 0; src_idx < size; ++src_idx) { + if ((src_idx % 3) != 0 || src_idx == 0 || dst_idx == 0) { + result[dst_idx] = src_str[size - 1 - src_idx]; + } else { + dst_idx -= 1; + result[dst_idx] = src_str[size - 1 - src_idx]; + } + dst_idx -= 1; + } + return result; } template @@ -127,7 +150,7 @@ enum Metric { COMPUTATION_TIME, }; -class OpStat{ +class OpStat { public: void StatMetadata(const RunMetadata &meta_data); @@ -136,7 +159,8 @@ class OpStat{ private: std::string StatByMetric(const Metric metric, const int top_limit) const; - std::string StatByNodeType() const; + std::string StatByOpType() const; + std::string StatByMACs() const; std::string Summary() const; private: @@ -145,6 +169,7 @@ class OpStat{ std::string type; std::vector> output_shape; ConvPoolArgs args; + int64_t macs; int64_t order; TimeInfo start; TimeInfo rel_end; diff --git a/mace/core/net.cc b/mace/core/net.cc index 1732cfe1a36f04b9fed6c378e67b4637554113ae..7912a6d4209808c25b7b33b47806f3eedf81112b 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -403,8 +403,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { std::string type = op->debug_def().type(); if (type.compare("Conv2D") == 0 || - type.compare("FusedConv2D") == 0 || + type.compare("Deconv2D") == 0 || type.compare("DepthwiseConv2d") == 0 || + type.compare("DepthwiseDeconv2d") == 0 || type.compare("Pooling") == 0) { strides = op->GetRepeatedArgs("strides"); padding_type = op->GetOptionalArg("padding", -1); @@ 
-415,6 +416,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { } else { kernels = op->Input(1)->shape(); } + } else if (type.compare("MatMul") == 0) { + bool transpose_a = op->GetOptionalArg("transpose_a", false); + kernels = op->Input(0)->shape(); + if (transpose_a) { + std::swap(kernels[kernels.size()-2], kernels[kernels.size()-1]); + } + } else if (type.compare("FullyConnected") == 0) { + kernels = op->Input(1)->shape(); } std::vector> output_shapes; diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc index 5da1750988604683504ad9ccc60af3c5ff8b8fbf..57be33c2a686451a7fb9bccb7e8ce86f13bdfa3e 100644 --- a/mace/core/testing/test_benchmark.cc +++ b/mace/core/testing/test_benchmark.cc @@ -28,7 +28,7 @@ namespace testing { static std::vector *all_benchmarks = nullptr; static int64_t bytes_processed; -static int64_t macc_processed; +static int64_t macs_processed = 0; static int64_t accum_time = 0; static int64_t start_time = 0; @@ -62,8 +62,8 @@ void Benchmark::Run(const char *pattern) { // Internal perf regression tools depends on the output formatting, // please keep in consistent when modifying printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)", - "Iterations", "Input(MB/s)", "MACC(G/s)"); - printf("%s\n", std::string(width + 44, '-').c_str()); + "Iterations", "Input(MB/s)", "GMACPS"); + printf("%s\n", std::string(width + 45, '-').c_str()); for (auto b : *all_benchmarks) { if (!std::regex_match(b->name_, match, regex)) continue; int iters; @@ -71,9 +71,9 @@ void Benchmark::Run(const char *pattern) { b->Run(&iters, &seconds); float mbps = (bytes_processed * 1e-6) / seconds; // MACCs or other computations - float gmaccs = (macc_processed * 1e-9) / seconds; + float gmacs = (macs_processed * 1e-9) / seconds; printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, b->name_.c_str(), - seconds * 1e9 / iters, iters, mbps, gmaccs); + seconds * 1e9 / iters, iters, mbps, gmacs); } } @@ -89,7 +89,7 @@ void Benchmark::Run(int 
*run_count, double *run_seconds) { int64_t iters = kMinIters; while (true) { bytes_processed = -1; - macc_processed = -1; + macs_processed = 0; RestartTiming(); (*benchmark_func_)(iters); StopTiming(); @@ -108,7 +108,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) { } void BytesProcessed(int64_t n) { bytes_processed = n; } -void MaccProcessed(int64_t n) { macc_processed = n; } +void MacsProcessed(int64_t n) { macs_processed = n; } void RestartTiming() { accum_time = 0; start_time = NowMicros(); diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h index b6c070c71caf69dd4e4bad7eddd76d41808da9ce..2eb91e4024ca21cf8e4b24aa26fe523776286589 100644 --- a/mace/core/testing/test_benchmark.h +++ b/mace/core/testing/test_benchmark.h @@ -42,7 +42,7 @@ class Benchmark { }; void BytesProcessed(int64_t); -void MaccProcessed(int64_t); +void MacsProcessed(int64_t); void RestartTiming(); void StartTiming(); void StopTiming(); diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 1d8c821d9b7c7da10be25b705cb5865376f345fb..f6e01a74ef7d9d3ac3ef647646a8bf3df85d8667 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -230,6 +230,7 @@ cc_test( linkstatic = 1, deps = [ "test", + "//mace/benchmark:statistics", "//mace/core:test_benchmark_main", "//third_party/eigen3", ], diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc index 76447e9b6134229a002ac94bb09f58b2f857d038..6faf62cebc221b0e9b37ba765c1d381432db44c0 100644 --- a/mace/ops/activation_benchmark.cc +++ b/mace/ops/activation_benchmark.cc @@ -62,7 +62,6 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) { static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ReluBenchmark(iters, N, C, H, W); \ } \ @@ -119,7 +118,6 @@ void ReluxBenchmark(int iters, int batch, 
int channels, int height, int width) { static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ReluxBenchmark(iters, N, C, H, W); \ } \ @@ -179,7 +177,6 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) { static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ PreluBenchmark(iters, N, C, H, W); \ } \ @@ -235,7 +232,6 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) { static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ TanhBenchmark(iters, N, C, H, W); \ } \ @@ -292,7 +288,6 @@ void SigmoidBenchmark( static void MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ SigmoidBenchmark(iters, N, C, H, W); \ } \ diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index f5e11740d79597bc02e9f2fba3c55a6e286b8a7c..b9751557b830a3c621ad9d62147011a3897dcbec 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -59,7 +59,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * INPUTS * N * H * W * C; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ AddNBenchmark(iters, INPUTS, N, H, W, 
C); \ } \ diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index d3467e769f32a69732b366e2d077f5fb6c8959e8..a6afcb077aef3ce9a296bf16b00355ec8a98d268 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -75,7 +75,7 @@ void BatchNorm( static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ + mace::testing::MacsProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BatchNorm(iters, N, C, H, W); \ } \ diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc index 9664a917e6256687a7c0bba75a3c5cb52732071e..64264936d65cc097ac47027de448b1f10dde17f4 100644 --- a/mace/ops/batch_to_space_benchmark.cc +++ b/mace/ops/batch_to_space_benchmark.cc @@ -58,7 +58,6 @@ void BMBatchToSpace( MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BMBatchToSpace(iters, N, C, H, W, ARG); \ } \ diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 9026ffb2b2142b4b7d9d99c303401fc759ca0e05..f0604d56446bced04d9c21a017d74c8e8448f9e6 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -65,7 +65,6 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BiasAdd(iters, N, C, H, W); \ } \ diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index 
f5f1df413258fc1a1a66729b7af7d39604281039..4ba0f64c1ce2354f3e8b133664303dff59896a07 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -68,7 +68,6 @@ void FilterBufferToImage(int iters, static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * O * I * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ FilterBufferToImage(iters, O, I, H, W); \ } \ diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc index db5f8494af4d2f0bfceb1288d250572d1e15a830..8ea6d139a30efd2389c003daac152ac36d3e6b15 100644 --- a/mace/ops/channel_shuffle_benchmark.cc +++ b/mace/ops/channel_shuffle_benchmark.cc @@ -61,7 +61,6 @@ void ChannelShuffle( MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ChannelShuffle(iters, N, C, H, W, G); \ } \ diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index a43fc3084f880754612e50d75753d353d09dd04f..eaff9b44256941ef4389610b994ead52f78319f1 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -49,7 +49,6 @@ void ConcatHelper(int iters, int concat_dim, int dim0, int dim1) { net.Run(); } const int64_t tot = static_cast(iters) * dim0 * dim1 * 2; - mace::testing::MaccProcessed(tot); testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { @@ -104,7 +103,6 @@ void OpenCLConcatHelper(int iters, const int64_t tot = static_cast(iters) * (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size()); - mace::testing::MaccProcessed(tot); testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { diff --git a/mace/ops/conv_2d_benchmark.cc 
b/mace/ops/conv_2d_benchmark.cc index 91efff7974df9e159f531fb4fcd104751e5ed0f4..a0e780032b541d7cd54ab10dbcef8bfee35b7782 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -14,6 +14,7 @@ #include +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" @@ -154,9 +155,10 @@ void Conv2d(int iters, (H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \ int64_t ow = \ (W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \ - const int64_t macc = \ - static_cast(iters) * N * OC * oh * ow * (KH * KW * C + 1); \ - mace::testing::MaccProcessed(macc); \ + const int64_t macs = \ + static_cast(iters) * mace::benchmark::StatMACs( \ + "Conv2D", {OC, C, KH, KW}, {N, oh, ow, OC}); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Conv2d(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \ mace::Padding::P, OC); \ diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc index aad6f93d610e8ac6eed96bd0aef9bcbcbf27cdca..5133a28abd338dd463c3cda52228457ebdc101d3 100644 --- a/mace/ops/crop_benchmark.cc +++ b/mace/ops/crop_benchmark.cc @@ -44,7 +44,6 @@ void CropHelper(int iters, int crop_axis, int dim1, int offset) { net.RunOp(D); } const int64_t tot = static_cast(iters) * kDim0 * dim1 * dim1; - mace::testing::MaccProcessed(tot); testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { @@ -96,7 +95,6 @@ void OpenCLCropHelper(int iters, const int64_t tot = static_cast(iters) * (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size()); - mace::testing::MaccProcessed(tot); testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index 81be17c092ad0d6e91bbdf0514a4c0d94e641b10..9a2c405dfd6bb287c778f74c03fa1375472924af 100644 --- 
a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -14,6 +14,7 @@ #include +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" @@ -90,9 +91,10 @@ static void Deconv2d(int iters, const int64_t tot = static_cast(iters) * N * C * H * W; \ int64_t oh = OH; \ int64_t ow = OW; \ - const int64_t macc = \ - static_cast(iters) * N * OC * oh * ow * (KH * KW * C + 1); \ - mace::testing::MaccProcessed(macc); \ + const int64_t macs = \ + static_cast(iters) * mace::benchmark::StatMACs( \ + "Deconv2D", {OC, C, KH, KW}, {N, OH, OW, OC}); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Deconv2d(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \ mace::Padding::P, OC); \ diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc index c9c6dd4016b97869289388ecbfbe200347846269..1283e432a7cde4e57929ebd470732f7bb5bed088 100644 --- a/mace/ops/depth_to_space_benchmark.cc +++ b/mace/ops/depth_to_space_benchmark.cc @@ -62,7 +62,6 @@ void DepthToSpace( MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ DepthToSpace(iters, N, C, H, W, G); \ } \ diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index 4d44a9bc136b59fc5e29dd93343638f65b58db88..c5aee849f171e82ff1190ac18140cdc300e8c059 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ b/mace/ops/depthwise_conv2d_benchmark.cc @@ -14,6 +14,7 @@ #include +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" @@ -115,9 +116,10 @@ void DepthwiseConv2d(int iters, (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / 
STRIDE + 1; \ int64_t ow = \ (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \ - const int64_t macc = \ - static_cast(iters) * N * C * M * oh * ow * (KH * KW + 1); \ - mace::testing::MaccProcessed(macc); \ + const int64_t macs = \ + static_cast(iters) * mace::benchmark::StatMACs( \ + "DepthwiseConv2d", {M, C, KH, KW}, {N, oh, ow, C}); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ DepthwiseConv2d(iters, N, C, H, W, KH, KW, STRIDE, \ mace::Padding::P, M); \ diff --git a/mace/ops/depthwise_deconv2d_benchmark.cc b/mace/ops/depthwise_deconv2d_benchmark.cc index 081e10d27ce6748d397f635d53b9f74673a15c20..a130ca1d3fd5b58a0e9a89b770061f1f84575315 100644 --- a/mace/ops/depthwise_deconv2d_benchmark.cc +++ b/mace/ops/depthwise_deconv2d_benchmark.cc @@ -14,6 +14,7 @@ #include +#include "mace/benchmark/statistics.h" #include "mace/core/operator.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" @@ -81,11 +82,12 @@ static void DepthwiseDeconv2d(int iters, ##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - const int64_t macc = \ - static_cast(iters) * N * H * W * KH * KW * C; \ - mace::testing::MaccProcessed(macc); \ + const int64_t macs = \ + static_cast(iters) * mace::benchmark::StatMACs( \ + "DepthwiseDeconv2d", {1, C, KH, KW}, {N, H, W, C}); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - DepthwiseDeconv2d(iters, N, C, H, W, KH, KW, S, P); \ + DepthwiseDeconv2d(iters, N, C, H, W, KH, KW, S, P); \ } \ MACE_BENCHMARK( \ MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\ diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 95808bc336a46231d920a7c409e846b89725e2ed..b75149bd3bd49e5b83335753eb9c9d5b18d07be2 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -66,7 +66,6 @@ void EltwiseBenchmark( 
MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * H * W * C; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ EltwiseBenchmark( \ iters, static_cast(ELT_TYPE), N, H, W, C); \ diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index bb27c97dcdf2197c6f1e60ef59589b4d7a39b429..bb6dcd80eca7c9a2850d96e8d3cc7915267c7e8d 100644 --- a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -14,6 +14,7 @@ #include +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" @@ -104,11 +105,12 @@ void FCBenchmark( #define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \ static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \ int iters) { \ - const int64_t macc = \ - static_cast(iters) * N * C * H * W * OC + OC; \ + const int64_t macs = \ + static_cast(iters) * mace::benchmark::StatMACs( \ + "FullyConnected", {OC, H, W, C}, {N, 1, 1, OC}); \ const int64_t tot = \ static_cast(iters) * (N + OC) * C * H * W + OC; \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ FCBenchmark(iters, N, H, W, C, OC); \ } \ diff --git a/mace/ops/gather_benchmark.cc b/mace/ops/gather_benchmark.cc index 5e52875c2074b3b4c23f5e2dab5ebe7a2119e7d9..7fe4a0fb568742b8391245c8a82c135cd78e48a2 100644 --- a/mace/ops/gather_benchmark.cc +++ b/mace/ops/gather_benchmark.cc @@ -66,7 +66,6 @@ void GatherBenchmark(int iters, MACE_BM_GATHER##_##N##_##IND##_##VOC##_##EMBED##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * IND * EMBED; \ - mace::testing::MaccProcessed(0); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ GatherBenchmark(iters, N, IND, VOC, EMBED); \ } \ diff --git a/mace/ops/local_response_norm_benchmark.cc 
b/mace/ops/local_response_norm_benchmark.cc index b917c495ffd574df459fd6881f276a9c6e09782f..61207af0c42a11c676f28d2a506304ab70a1458d 100644 --- a/mace/ops/local_response_norm_benchmark.cc +++ b/mace/ops/local_response_norm_benchmark.cc @@ -59,7 +59,6 @@ static void LocalResponseNorm( MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ LocalResponseNorm(iters, N, C, H, W); \ } \ diff --git a/mace/ops/lstmcell_benchmark.cc b/mace/ops/lstmcell_benchmark.cc index 6568025a1a169ed856cf3df8704f635bb9824b2b..a3b9609490d4e9965cb89ee0bde45badd8cee870 100644 --- a/mace/ops/lstmcell_benchmark.cc +++ b/mace/ops/lstmcell_benchmark.cc @@ -79,11 +79,11 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) { static void \ MACE_BM_LSTMCELL_##N##_##INPUT_SIZE##_##HIDDEN_UNITS##_##TYPE##_##DEVICE(\ int iters) { \ - const int64_t macc = \ + const int64_t macs = \ static_cast( \ iters) * N * (INPUT_SIZE + HIDDEN_UNITS) * 4 * HIDDEN_UNITS; \ const int64_t tot = static_cast(iters) * N * INPUT_SIZE; \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \ LSTMCell(iters, N, INPUT_SIZE, HIDDEN_UNITS); \ } \ diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index f118e63f4680b68f0f77bc55697cf318f729caaa..1996587ad47ee60aa524b5e48118a45cca8e4a64 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -19,6 +19,7 @@ #include #include "public/gemmlowp.h" +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/gemm.h" #include "mace/ops/sgemm.h" @@ -223,9 +224,10 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) { #define MACE_BM_MATMUL_FUNC(M, K, N, FUNC, TYPE) \ static void 
MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \ - const int64_t macc = static_cast(iters) * M * K * N; \ + const int64_t macs = static_cast(iters) * \ + mace::benchmark::StatMACs("MatMul", {K}, {M, N}); \ const int64_t tot = static_cast(iters) * (M + N) * K; \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot * sizeof(TYPE)); \ MatmulBenchmark_##FUNC(iters, M, K, N); \ } \ @@ -377,9 +379,10 @@ void MatMulTransposeBenchmark( #define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \ static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \ int iters) { \ - const int64_t macc = static_cast(iters) * N * C * H * W; \ + const int64_t macs = static_cast(iters) * \ + mace::benchmark::StatMACs("MatMul", {C}, {N, H, W}); \ const int64_t tot = static_cast(iters) * N * (C * H + H * W); \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ MatMulBenchmark(iters, N, H, C, W); \ } \ @@ -392,9 +395,10 @@ void MatMulTransposeBenchmark( #define MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE) \ static void MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \ int iters) { \ - const int64_t macc = static_cast(iters) * N * C * H * W; \ + const int64_t macs = static_cast(iters) * \ + mace::benchmark::StatMACs("MatMul", {C}, {N, H, W}); \ const int64_t tot = static_cast(iters) * N * (C * H + H * W); \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ MatMulTransposeBenchmark(iters, N, H, C, W); \ } \ diff --git a/mace/ops/memory_benchmark.cc b/mace/ops/memory_benchmark.cc index e3bb30a81f5880d663257d2aafbaab277dee4e9d..73f3bdeb46ce8d1ee2d3013ff5c23aa7e15ab319 100644 --- a/mace/ops/memory_benchmark.cc +++ b/mace/ops/memory_benchmark.cc @@ -94,7 +94,6 @@ void MemoryAccessBenchmark_NHCW( static void 
MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER( \ int iters) { \ const int64_t tot = static_cast(iters) * N * H * W * C; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot * sizeof(float)); \ MemoryAccessBenchmark_##ORDER(iters, N, H, W, C); \ } \ diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index fb7f4e14426677b1ee26bf0ba3459ea5043074ea..0125b4f5b4b1c44e462f077ba6a9d17165764ab3 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -57,7 +57,6 @@ void Pad(int iters, int batch, int height, static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Pad(iters, N, H, W, C, PAD); \ } \ diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index c48cc8771fec57898dfe648abc7db7438bd5e330..880c0cad5462c78dfa4ce0f50816ffb6dbe0d002 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -81,7 +81,6 @@ void Pooling(int iters, ##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Pooling(iters, N, C, H, W, KE, STRIDE, Padding::PA, \ PoolingType::PO); \ diff --git a/mace/ops/quantize_benchmark.cc b/mace/ops/quantize_benchmark.cc index 62a534b721894360b922270fe03833be60ad582a..0c1493b80450a586cde90d80285ad57629cdc276 100644 --- a/mace/ops/quantize_benchmark.cc +++ b/mace/ops/quantize_benchmark.cc @@ -82,7 +82,6 @@ void Dequantize(int iters, int count) { MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Quantize(iters, N); \ } \ @@ -97,7 +96,6 @@ void Dequantize(int iters, int count) { 
MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Dequantize(iters, N); \ } \ diff --git a/mace/ops/reduce_benchmark.cc b/mace/ops/reduce_benchmark.cc index ec8807b0488892de1ac22eb5136dc5b524482c27..663c3b45d7a3aa3eb585e0ef14c31c1b093933dc 100644 --- a/mace/ops/reduce_benchmark.cc +++ b/mace/ops/reduce_benchmark.cc @@ -60,7 +60,6 @@ void Reduce(int iters, int batch, int channels, MACE_BM_REDUCE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Reduce(iters, N, C, H, W); \ } \ diff --git a/mace/ops/resize_bicubic_benchmark.cc b/mace/ops/resize_bicubic_benchmark.cc index 5ababebaa29676f289c368222bde120acf9c0aca..85e073fd1fba1de4c1e53da9cee19c3b8d964ecc 100644 --- a/mace/ops/resize_bicubic_benchmark.cc +++ b/mace/ops/resize_bicubic_benchmark.cc @@ -13,6 +13,8 @@ // limitations under the License. 
#include + +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" @@ -69,9 +71,10 @@ void ResizeBicubicBenchmark(int iters, MACE_BM_RESIZE_BICUBIC_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ ##DEVICE( \ int iters) { \ - const int64_t macc = static_cast(iters) * N * C * H1 * W1 * 3; \ + const int64_t macs = static_cast(iters) * \ + mace::benchmark::StatMACs("ResizeBicubic", {}, {N, H1, W1, C}); \ const int64_t tot = static_cast(iters) * N * C * H0 * W0; \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ResizeBicubicBenchmark(iters, N, C, H0, W0, H1, W1); \ } \ diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index bace4f10374d681df889e6fd5451c37abc2d646c..ddc0f508b0d0e677fac6abd8bb7a61d79087b4e3 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -13,6 +13,8 @@ // limitations under the License. 
#include + +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" @@ -75,9 +77,10 @@ void ResizeBilinearBenchmark(int iters, MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ ##DEVICE( \ int iters) { \ - const int64_t macc = static_cast(iters) * N * C * H1 * W1 * 3; \ + const int64_t macs = static_cast(iters) * \ + mace::benchmark::StatMACs("ResizeBilinear", {}, {N, H1, W1, C}); \ const int64_t tot = static_cast(iters) * N * C * H0 * W0; \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ResizeBilinearBenchmark(iters, N, C, H0, W0, H1, W1); \ } \ diff --git a/mace/ops/reverse_benchmark.cc b/mace/ops/reverse_benchmark.cc index 9630f696011b5a04e1ee4ed18e03e19de9b1e333..9b7a915a58a3aaed3988889ef2cd80e855a4423a 100644 --- a/mace/ops/reverse_benchmark.cc +++ b/mace/ops/reverse_benchmark.cc @@ -51,10 +51,9 @@ void Reverse(int iters, int batch, int channels, int height, int width) { #define MACE_BM_REVERSE_MACRO(N, C, H, W, TYPE, DEVICE) \ static void MACE_BM_REVERSE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ - const int64_t macc = \ + const int64_t macs = \ static_cast(iters) * N * C * H * W; \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(macc); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Reverse(iters, N, C, H, W); \ } \ diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc index 25095da54f94324afd34274f79b09c59c1b4e3a7..819544b289b17547cb0d4443f4408dc2ad60d91f 100644 --- a/mace/ops/softmax_benchmark.cc +++ b/mace/ops/softmax_benchmark.cc @@ -98,7 +98,6 @@ void SoftmaxBenchmark( static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot 
*(sizeof(TYPE))); \ SoftmaxBenchmark(iters, N, C, H, W); \ } \ diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc index cacadfcd9673019a9c3f7938d72ebc3d45608c96..168461de213d9709dca3f1f9cfb6d3d1fff4f13c 100644 --- a/mace/ops/space_to_batch_benchmark.cc +++ b/mace/ops/space_to_batch_benchmark.cc @@ -64,7 +64,6 @@ void BMSpaceToBatch( MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BMSpaceToBatch(iters, N, H, W, C, SHAPE); \ } \ diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc index 3311d6186272cee46cc53f8e6d9426e9eb962295..6bd7755e0c9da1b2503cbf66090f99d361d2fd99 100644 --- a/mace/ops/space_to_depth_benchmark.cc +++ b/mace/ops/space_to_depth_benchmark.cc @@ -62,7 +62,6 @@ void SpaceToDepth( MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ SpaceToDepth(iters, N, C, H, W, G); \ } \ diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc index b21da8f5c7f055437a6a59952c3bea4957636efd..020c32142ce6ffdd743b9c7e4b054062811afa9d 100644 --- a/mace/ops/split_benchmark.cc +++ b/mace/ops/split_benchmark.cc @@ -65,7 +65,7 @@ void BMSplitHelper(int iters, MACE_BM_SPLIT_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * H * W * C; \ - mace::testing::MaccProcessed(tot); \ + mace::testing::MacsProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BMSplitHelper(iters, {N, H, W, C}, NO); \ } \ diff --git a/mace/ops/sqrdiff_mean_benchmark.cc b/mace/ops/sqrdiff_mean_benchmark.cc index 
353d8e7addfa4748fb7a160710bea226d3c569ab..1d2a7aa377d7fc8fb9b5c8eeb987b7a20e4ba40b 100644 --- a/mace/ops/sqrdiff_mean_benchmark.cc +++ b/mace/ops/sqrdiff_mean_benchmark.cc @@ -63,7 +63,6 @@ void SqrDiffMean(int iters, int batch, int channels, MACE_BM_SQRDIFF_MEAN_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ SqrDiffMean(iters, N, C, H, W); \ } \ diff --git a/mace/ops/transpose_benchmark.cc b/mace/ops/transpose_benchmark.cc index f584239a6d3277c934f93eb356b11919e877993e..372f2f9d08917820e3c88b435e01d786715ab050 100644 --- a/mace/ops/transpose_benchmark.cc +++ b/mace/ops/transpose_benchmark.cc @@ -58,7 +58,6 @@ void TransposeBenchmark(int iters, static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ TransposeBenchmark(iters, {H, W}, {1, 0}); \ } \ @@ -72,7 +71,6 @@ void TransposeBenchmark(int iters, MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\ DEVICE(int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ TransposeBenchmark(iters, {N, C, H, W}, {D0, D1, D2, D3}); \ } \ diff --git a/mace/test/BUILD b/mace/test/BUILD index 63faecfe7cb782145054f0abe55564e8ac7ab0f8..283dd486ff812df5d7b729c67a5f1a449a751513 100644 --- a/mace/test/BUILD +++ b/mace/test/BUILD @@ -7,10 +7,10 @@ licenses(["notice"]) # Apache 2.0 load( "//mace:mace.bzl", "if_android", - "if_hexagon_enabled", - "if_not_hexagon_enabled", - "if_openmp_enabled", "if_neon_enabled", + "if_openmp_enabled", + "if_android_armv7", + "if_hexagon_enabled", "if_opencl_enabled", "if_quantize_enabled", ) @@ -32,16 +32,19 @@ cc_test( "-Wextra", 
"-Wno-missing-field-initializers", ] + if_openmp_enabled([ - "-fopenmp", - "-DMACE_ENABLE_OPENMP", + "-fopenmp" + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", ]) + if_opencl_enabled([ "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", - ]) + if_neon_enabled([ - "-DMACE_ENABLE_NEON", ]), linkopts = ["-fopenmp"], linkstatic = 1, @@ -62,16 +65,19 @@ cc_test( "-Wextra", "-Wno-missing-field-initializers", ] + if_openmp_enabled([ - "-fopenmp", - "-DMACE_ENABLE_OPENMP", + "-fopenmp" + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", ]) + if_opencl_enabled([ "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", - ]) + if_neon_enabled([ - "-DMACE_ENABLE_NEON", ]), linkopts = ["-fopenmp"], linkstatic = 1, @@ -92,16 +98,19 @@ cc_test( "-Wextra", "-Wno-missing-field-initializers", ] + if_openmp_enabled([ - "-fopenmp", - "-DMACE_ENABLE_OPENMP", + "-fopenmp" + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", ]) + if_opencl_enabled([ "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", - ]) + if_neon_enabled([ - "-DMACE_ENABLE_NEON", ]), linkopts = ["-fopenmp"], linkstatic = 1, diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index 71f3cc148036dfe0221b07757ad55c1ff09e7536..96ad13a475e681ad9cce2cf78b0198d679bb34d0 100644 --- a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -50,7 +50,7 @@ def ops_benchmark_stdout_processor(stdout, dev, abi): if len(parts) == 5 and parts[0].startswith("BM_"): metrics["%s.time_ms" % parts[0]] = str(float(parts[1]) / 1e6) 
metrics["%s.input_mb_per_sec" % parts[0]] = parts[3] - metrics["%s.gmacc_per_sec" % parts[0]] = parts[4] + metrics["%s.gmac_per_sec" % parts[0]] = parts[4] # platform = dev[YAMLKeyword.target_socs] # model = dev[YAMLKeyword.device_name]