From 5f021151195a737100d2bcceb21e538cefe4a9e5 Mon Sep 17 00:00:00 2001 From: liuqi Date: Wed, 12 Dec 2018 13:05:57 +0800 Subject: [PATCH] Add MACs statistics for model benchmark tool and related docs --- .gitlab-ci.yml | 57 +++-- docs/index.rst | 1 + docs/user_guide/advanced_usage.rst | 2 + docs/user_guide/basic_usage.rst | 2 +- docs/user_guide/benchmark.rst | 293 ++++++++++++++++++++++ mace/benchmark/BUILD | 1 + mace/benchmark/benchmark_model.cc | 17 -- mace/benchmark/statistics.cc | 115 +++++++-- mace/benchmark/statistics.h | 31 ++- mace/core/net.cc | 11 +- mace/core/testing/test_benchmark.cc | 14 +- mace/core/testing/test_benchmark.h | 2 +- mace/ops/BUILD | 1 + mace/ops/activation_benchmark.cc | 5 - mace/ops/addn_benchmark.cc | 1 - mace/ops/batch_norm_benchmark.cc | 2 +- mace/ops/batch_to_space_benchmark.cc | 1 - mace/ops/bias_add_benchmark.cc | 1 - mace/ops/buffer_to_image_benchmark.cc | 1 - mace/ops/channel_shuffle_benchmark.cc | 1 - mace/ops/concat_benchmark.cc | 2 - mace/ops/conv_2d_benchmark.cc | 8 +- mace/ops/crop_benchmark.cc | 2 - mace/ops/deconv_2d_benchmark.cc | 8 +- mace/ops/depth_to_space_benchmark.cc | 1 - mace/ops/depthwise_conv2d_benchmark.cc | 8 +- mace/ops/depthwise_deconv2d_benchmark.cc | 10 +- mace/ops/eltwise_benchmark.cc | 1 - mace/ops/fully_connected_benchmark.cc | 8 +- mace/ops/gather_benchmark.cc | 1 - mace/ops/local_response_norm_benchmark.cc | 1 - mace/ops/lstmcell_benchmark.cc | 4 +- mace/ops/matmul_benchmark.cc | 16 +- mace/ops/memory_benchmark.cc | 1 - mace/ops/pad_benchmark.cc | 1 - mace/ops/pooling_benchmark.cc | 1 - mace/ops/quantize_benchmark.cc | 2 - mace/ops/reduce_benchmark.cc | 1 - mace/ops/resize_bicubic_benchmark.cc | 7 +- mace/ops/resize_bilinear_benchmark.cc | 7 +- mace/ops/reverse_benchmark.cc | 3 +- mace/ops/softmax_benchmark.cc | 1 - mace/ops/space_to_batch_benchmark.cc | 1 - mace/ops/space_to_depth_benchmark.cc | 1 - mace/ops/split_benchmark.cc | 2 +- mace/ops/sqrdiff_mean_benchmark.cc | 1 - 
mace/ops/transpose_benchmark.cc | 2 - mace/test/BUILD | 39 +-- tools/bazel_adb_run.py | 2 +- 49 files changed, 562 insertions(+), 139 deletions(-) create mode 100644 docs/user_guide/benchmark.rst diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5e3a22c5..f454edf8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,6 +9,7 @@ stages: - api_test - python_tools_tests - model_tests + - quantization_tests - build_android_demo - ops_benchmark - extra_tests @@ -62,6 +63,14 @@ api_test: - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS - python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS + - > + if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then + GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git + DEVICE_CONF_FILE=generic-mobile-devices/devices.yml + fi + - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS + - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS + - python tools/bazel_adb_run.py --target="//mace/test:mace_api_exception_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 
--target_socs=$TARGET_SOCS ops_benchmark: stage: ops_benchmark @@ -103,7 +112,7 @@ ndk_versions_compatible_tests: DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - > - for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b; + for ndk in android-ndk-r15c android-ndk-r16 android-ndk-r17b; do new_ndk_path=${prefix_path}${ndk}; if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then @@ -111,8 +120,12 @@ ndk_versions_compatible_tests: export PATH=$ANDROID_NDK_HOME:$PATH; echo "ndk path: $ANDROID_NDK_HOME"; if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*"; - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py 
--target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64-v8a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*"; fi done - export ANDROID_NDK_HOME=$DEFAULT_NDK_PATH @@ -131,9 +144,9 @@ python_tools_tests: DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - > - python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a,arm64 --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,armhf --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,armhf --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; 
model_tests: stage: model_tests @@ -142,23 +155,39 @@ model_tests: - rm -rf mace-models - rm -rf generic-mobile-devices - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git + - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml - > if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - > - for CONF_FILE in mace-models/mobilenet-v1/mobilenet-v1.yml mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml; - do - python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1; - python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; - done + python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml - > python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run 
--config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + - rm -rf mace-models + +quantization_tests: + stage: quantization_tests + script: + - pwd + - rm -rf mace-models + - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git + - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml + - > + if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then + GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git + DEVICE_CONF_FILE=generic-mobile-devices/devices.yml + fi + - > + python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; - rm -rf mace-models build_android_demo: diff --git a/docs/index.rst b/docs/index.rst index f839a13f..7545f2aa 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -27,6 +27,7 @@ The main documentation is organized into the 
following sections: user_guide/basic_usage user_guide/advanced_usage + user_guide/benchmark user_guide/op_lists user_guide/quantization_usage diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 93ebb4f8..8395c45b 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -379,6 +379,8 @@ Useful Commands * **benchmark and profile model** +The detailed information is in :doc:`benchmark`. + .. code:: sh # Benchmark model, get detailed statistics of each Op. diff --git a/docs/user_guide/basic_usage.rst b/docs/user_guide/basic_usage.rst index d4d404ba..6d59a68e 100644 --- a/docs/user_guide/basic_usage.rst +++ b/docs/user_guide/basic_usage.rst @@ -227,7 +227,7 @@ to run and validate your model. * **benchmark** - benchmark and profile the model. + Benchmark and profile the model. The details are in :doc:`benchmark`. .. code:: sh diff --git a/docs/user_guide/benchmark.rst b/docs/user_guide/benchmark.rst new file mode 100644 index 00000000..a190a7cc --- /dev/null +++ b/docs/user_guide/benchmark.rst @@ -0,0 +1,293 @@ +Benchmark usage +=============== + +This part describes how to use the MACE benchmark tools. + +Overview +-------- + +As mentioned in the previous part, there are two kinds of benchmark tools, +one for operators and the other for models. + +Operator Benchmark +------------------ + +Operator Benchmark is used to test and optimize the performance of a specific operator. + +===== +Usage +===== + + .. code:: bash + + python tools/bazel_adb_run.py --target="//mace/ops:ops_benchmark" --run_target=True --args="--filter=.*BM_CONV.*" + +====== +Output +====== + + .. 
code:: bash + + Benchmark Time(ns) Iterations Input(MB/s) GMACPS + ------------------------------------------------------------------------------------------------------ + MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_CPU 1759129 479 114.09 29.21 + MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_float_GPU 4031301 226 49.79 12.75 + MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_half_GPU 3996357 266 25.11 12.86 + MACE_BM_CONV_2D_1_1024_7_7_K1x1S1D1_SAME_1024_uint8_t_CPU 914994 1093 54.84 56.15 + + +=========== +Explanation +=========== + +.. list-table:: + :header-rows: 1 + + * - Options + - Usage + * - Benchmark + - Benchmark unit name. + * - Time + - Time of one round. + * - Iterations + - the number of iterations to run, which is between 10 and 1000,000,000. the value is calculated based on the strategy total run time does not exceed 1s. + * - Input + - The bandwidth of dealing with input. the unit is MB/s. + * - GMACPS + - The speed of running MACs(multiply-accumulation). the unit is G/s. + +Model Benchmark +--------------- + +Model Benchmark is used for test and optimize the performance of your model. +This tool could record the running time of the model and the detailed running information of each operator of your model. + +===== +Usage +===== + + .. code:: bash + + python tools/converter.py benchmark --config=/path/to/your/model_deployment.yml + +====== +Output +====== + + .. 
code:: bash + + I benchmark_model.cc:158 --------------------------------------------------------------------- + I benchmark_model.cc:158 Warm Up + I benchmark_model.cc:158 ---------------------------------------------------------------------- + I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std | + I benchmark_model.cc:158 ---------------------------------------------------------------------- + I benchmark_model.cc:158 | 1 | 51.481 | 51.481 | 51.481 | 51.481 | 51.481 | 0.000 | + I benchmark_model.cc:158 ---------------------------------------------------------------------- + I benchmark_model.cc:158 + I benchmark_model.cc:158 ------------------------------------------------------------------------ + I benchmark_model.cc:158 Run without statistics + I benchmark_model.cc:158 ------------------------------------------------------------------------- + I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std | + I benchmark_model.cc:158 ------------------------------------------------------------------------- + I benchmark_model.cc:158 | 100 | 30.272 | 31.390 | 29.938 | 45.966 | 30.913 | 1850.983 | + I benchmark_model.cc:158 ------------------------------------------------------------------------- + I benchmark_model.cc:158 + I benchmark_model.cc:158 ----------------------------------------------------------------------- + I benchmark_model.cc:158 Run with statistics + I benchmark_model.cc:158 ------------------------------------------------------------------------ + I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std | + I benchmark_model.cc:158 ------------------------------------------------------------------------ + I benchmark_model.cc:158 | 100 | 32.358 | 33.327 | 32.293 | 33.607 | 33.002 | 310.435 | + I benchmark_model.cc:158 ------------------------------------------------------------------------ + I statistics.cc:343 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 Sort by Run Order + I statistics.cc:343 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 | Op Type | Start | First | Avg(ms) | % | cdf% | GMACPS | Stride | Pad | Filter Shape | Output Shape | Dilation | name | + I statistics.cc:343 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 | Transpose | 0.000 | 0.102 | 0.100 | 0.315 | 0.315 | 0.000 | | | | [1,3,224,224] | | input | + I statistics.cc:343 | Conv2D | 0.107 | 1.541 | 1.570 | 4.943 | 5.258 | 6.904 | [2,2] | SAME | [32,3,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_0/Relu6 | + I statistics.cc:343 | DepthwiseConv2d | 1.724 | 0.936 | 0.944 | 2.972 | 8.230 | 3.827 | [1,1] | SAME | [1,32,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6 | + I statistics.cc:343 | Softmax | 32.835 | 0.039 | 0.042 | 0.131 | 99.996 | 0.000 | | | | [1,1001] | | MobilenetV1/Predictions/Softmax | + I statistics.cc:343 | Identity | 32.880 | 0.001 | 0.001 | 0.004 | 100.000 | 0.000 | | | | [1,1001] | | mace_output_node_MobilenetV1/Predictions/Reshape_1 | + I statistics.cc:343 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 + I statistics.cc:343 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 Sort by Computation Time + I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 | Op Type | Start | First | Avg(ms) | % | cdf% | GMACPS | Stride | Pad | Filter Shape | Output Shape | Dilation | name | + I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 | Conv2D | 30.093 | 2.102 | 2.198 | 6.922 | 6.922 | 23.372 | [1,1] | SAME | [1024,1024,1,1] | [1,1024,7,7] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 7.823 | 2.115 | 2.164 | 6.813 | 13.735 | 23.747 | [1,1] | SAME | [128,128,1,1] | [1,128,56,56] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 15.859 | 2.119 | 2.109 | 6.642 | 20.377 | 24.358 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 23.619 | 2.087 | 2.096 | 6.599 | 26.976 | 24.517 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 26.204 | 2.081 | 2.093 | 6.590 | 33.567 | 24.549 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 21.038 | 2.036 | 2.091 | 6.585 | 40.152 | 24.569 | [1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 18.465 | 2.034 | 2.082 | 6.554 | 46.706 | 24.684 | 
[1,1] | SAME | [512,512,1,1] | [1,512,14,14] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 2.709 | 1.984 | 2.058 | 6.482 | 53.188 | 12.480 | [1,1] | SAME | [64,32,1,1] | [1,64,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 12.220 | 1.788 | 1.901 | 5.986 | 59.174 | 27.027 | [1,1] | SAME | [256,256,1,1] | [1,256,28,28] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6 | + I statistics.cc:343 | Conv2D | 0.107 | 1.541 | 1.570 | 4.943 | 64.117 | 6.904 | [2,2] | SAME | [32,3,3,3] | [1,32,112,112] | [1,1] | MobilenetV1/MobilenetV1/Conv2d_0/Relu6 | + I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + I statistics.cc:343 + I statistics.cc:343 ---------------------------------------------------------------------------------------------- + I statistics.cc:343 Stat by Op Type + I statistics.cc:343 ---------------------------------------------------------------------------------------------- + I statistics.cc:343 | Op Type | Count | Avg(ms) | % | cdf% | MACs | GMACPS | Called times | + I statistics.cc:343 ---------------------------------------------------------------------------------------------- + I statistics.cc:343 | Conv2D | 15 | 24.978 | 78.693 | 78.693 | 551,355,392 | 22.074 | 15 | + I statistics.cc:343 | DepthwiseConv2d | 13 | 6.543 | 20.614 | 99.307 | 17,385,984 | 2.657 | 13 | + I statistics.cc:343 | Transpose | 1 | 0.100 | 0.315 | 99.622 | 0 | 0.000 | 1 | + I statistics.cc:343 | Pooling | 1 | 0.072 | 0.227 | 99.849 | 0 | 0.000 | 1 | + I statistics.cc:343 | Softmax | 1 | 0.041 | 0.129 | 99.978 | 0 | 0.000 | 1 | + I statistics.cc:343 | Squeeze | 1 | 0.006 | 0.019 | 99.997 | 0 | 0.000 | 1 | + I statistics.cc:343 | Identity | 1 | 0.001 | 0.003 | 100.000 | 0 | 0.000 | 1 | + I statistics.cc:343 
---------------------------------------------------------------------------------------------- + I statistics.cc:343 + I statistics.cc:343 --------------------------------------------------------- + I statistics.cc:343 Stat by MACs(Multiply-Accumulation) + I statistics.cc:343 --------------------------------------------------------- + I statistics.cc:343 | total | round | first(G/s) | avg(G/s) | std | + I statistics.cc:343 --------------------------------------------------------- + I statistics.cc:343 | 568,741,376 | 100 | 18.330 | 17.909 | 301.326 | + I statistics.cc:343 --------------------------------------------------------- + I statistics.cc:343 ------------------------------------------------------------------------ + I statistics.cc:343 Summary of Ops' Stat + I statistics.cc:343 ------------------------------------------------------------------------ + I statistics.cc:343 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) | std | + I statistics.cc:343 ------------------------------------------------------------------------ + I statistics.cc:343 | 100 | 31.028 | 32.093 | 31.028 | 32.346 | 31.758 | 301.326 | + I statistics.cc:343 ------------------------------------------------------------------------ + + +=========== +Explanation +=========== + +There are 8 sections of the output information. + +1. **Warm Up** + +This section lists the time information of warm-up run. +The detailed explanation is list as below. + +.. list-table:: + :header-rows: 1 + + * - Key + - Explanation + * - round + - the number of round has been run. + * - first + - the run time of first round. unit is millisecond. + * - curr + - the run time of last round. unit is millisecond. + * - min + - the minimal run time of all rounds. unit is millisecond. + * - max + - the maximal run time of all rounds. unit is millisecond. + * - avg + - the average run time of all rounds. unit is millisecond. + * - std + - the standard deviation of all rounds. + +2. 
**Run without statistics** + +This section lists the run time information without statistics code. +The detailed explanation is the same as in the Warm Up section. + +3. **Run with statistics** + +This section lists the run time information with statistics code; +the time may be longer compared with the second section. +The detailed explanation is the same as in the Warm Up section. + +4. **Sort by Run Order** + +This section lists the detailed run information of every operator in your model. +The operators are listed in run order; every line is an operator of your model. +The detailed explanation is listed below. + +.. list-table:: + :header-rows: 1 + + * - Key + - Explanation + * - Op Type + - the type of operator. + * - Start + - the start time of the operator. The unit is millisecond. + * - First + - the run time of the first round. The unit is millisecond. + * - Avg + - the average run time of all rounds. The unit is millisecond. + * - % + - the percentage of total running time. + * - cdf% + - the cumulative percentage of running time. + * - GMACPS + - the number of MACs (multiply-accumulate operations) executed per second. The unit is G/s. + * - Stride + - the stride parameter of the operator, if it exists. + * - Pad + - the pad parameter of the operator, if it exists. + * - Filter Shape + - the filter shape of the operator, if it exists. + * - Output Shape + - the output shape of the operator. + * - Dilation + - the dilation parameter of the operator, if it exists. + * - Name + - the name of the operator. + +5. **Sort by Computation time** + +This section lists the top-10 most time-consuming operators. +The operators are listed in order of computation time; +the detailed explanation is the same as in the previous section. + +6. **Stat by Op Type** + +This section summarizes the run information of operators grouped by operator type. + +.. list-table:: + :header-rows: 0 + + * - Op Type + - the type of operator. + * - Count + - the number of operators with the type. 
+ * - Avg + - the average run time of the operator. The unit is millisecond. + * - % + - the percentage of total running time. + * - cdf% + - the cumulative percentage of running time. + * - MACs + - the number of MACs (multiply-accumulate operations). + * - GMACPS + - the number of MACs (multiply-accumulate operations) executed per second. The unit is G/s. + * - Called times + - the number of times the operator type is called in all rounds. + +7. **Stat by MACs** + +This section summarizes the MACs information of your model. + +.. list-table:: + :header-rows: 0 + + * - total + - the number of MACs of your model. + * - round + - the number of rounds that have been run. + * - First + - the GMACPS of the first round. The unit is G/s. + * - Avg + - the average GMACPS of all rounds. The unit is G/s. + * - std + - the standard deviation of all rounds. + +8. **Summary of Ops' Stat** + +This section lists the run time information obtained by summing every operator's run time, +which may be shorter than the model's run time with statistics. +The detailed explanation is the same as in the Warm Up section. 
diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD index b086ad24..3fe9a006 100644 --- a/mace/benchmark/BUILD +++ b/mace/benchmark/BUILD @@ -15,6 +15,7 @@ cc_library( srcs = ["statistics.cc"], hdrs = ["statistics.h"], copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"], + visibility = ["//visibility:public"], deps = [ "//mace/utils", ], diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index 7f0afe24..bcb9ae75 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -48,23 +48,6 @@ std::vector Split(const std::string &str, char delims) { return result; } -bool SplitAndParseToInts(const std::string &str, - char delims, - std::vector *result) { - std::string tmp = str; - while (!tmp.empty()) { - int64_t dim = atoi(tmp.data()); - result->push_back(dim); - size_t next_offset = tmp.find(delims); - if (next_offset == std::string::npos) { - break; - } else { - tmp = tmp.substr(next_offset + 1); - } - } - return true; -} - } // namespace str_util void ParseShape(const std::string &str, std::vector *shape) { diff --git a/mace/benchmark/statistics.cc b/mace/benchmark/statistics.cc index 0f05798c..7329c247 100644 --- a/mace/benchmark/statistics.cc +++ b/mace/benchmark/statistics.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include #include "mace/benchmark/statistics.h" @@ -53,7 +54,6 @@ std::string ShapeToString( if (output_shape.empty()) { return ""; } - std::stringstream stream; stream << "["; for (size_t i = 0; i < output_shape.size(); ++i) { @@ -94,6 +94,46 @@ std::string VectorToString(const std::vector &vec) { } // namespace + +int64_t StatMACs(const std::string &op_type, + const std::vector &filter_shape, + const std::vector &output_shape) { + int64_t macs = 0; + if (op_type == "Conv2D" || op_type == "Deconv2D") { + macs = output_shape[0] * output_shape[1] * output_shape[2] + * output_shape[3] + * filter_shape[2] * filter_shape[3] * filter_shape[1]; + } else if (op_type == "MatMul") { + macs = std::accumulate(output_shape.begin(), + output_shape.end(), + 1, + std::multiplies()) + * filter_shape.back(); + } else if (op_type == "DepthwiseConv2d") { + macs = output_shape[0] * output_shape[1] * output_shape[2] + * output_shape[3] * filter_shape[0] * filter_shape[2] * filter_shape[3]; + } else if (op_type == "DepthwiseDeconv2d") { + macs = output_shape[0] * output_shape[1] * output_shape[2] + * output_shape[3] * filter_shape[2] * filter_shape[3]; + } else if (op_type == "FullyConnected") { + macs = output_shape[0] * std::accumulate(filter_shape.begin(), + filter_shape.end(), + 1, + std::multiplies()); + } else if (op_type == "BatchNorm") { + macs = std::accumulate(output_shape.begin(), + output_shape.end(), + 1, + std::multiplies()); + } else if (op_type == "ResizeBilinear" || op_type == "ResizeBicubic") { + macs = 3 * std::accumulate(output_shape.begin(), + output_shape.end(), + 1, + std::multiplies()); + } + return macs; +} + void OpStat::StatMetadata(const RunMetadata &meta_data) { if (meta_data.op_stats.empty()) { LOG(FATAL) << "Op metadata should not be empty"; @@ -112,6 +152,8 @@ void OpStat::StatMetadata(const RunMetadata &meta_data) { record->type = op_stat.type; record->args = op_stat.args; record->output_shape = op_stat.output_shape; + record->macs = + 
StatMACs(op_stat.type, op_stat.args.kernels, op_stat.output_shape[0]); record->order = order_idx; order_idx += 1; } @@ -148,7 +190,7 @@ std::string OpStat::StatByMetric(const Metric metric, // generate string std::string title = "Sort by " + MetricToString(metric); const std::vector header = { - "Node Type", "Start", "First", "Avg(ms)", "%", "cdf%", + "Op Type", "Start", "First", "Avg(ms)", "%", "cdf%", "GMACPS", "Stride", "Pad", "Filter Shape", "Output Shape", "Dilation", "name" }; std::vector> data; @@ -169,6 +211,9 @@ std::string OpStat::StatByMetric(const Metric metric, FloatToString(record.rel_end.sum() * 100.f / total_time_.sum(), 3)); tuple.push_back( FloatToString(accumulate_time * 100.f / total_time_.sum(), 3)); + tuple.push_back(FloatToString( + record.macs < 1e-6 ? record.macs : + (record.macs * 1e-3) / record.rel_end.avg(), 3)); tuple.push_back(VectorToString(record.args.strides)); if (record.args.padding_type != -1) { tuple.push_back(PaddingTypeToString(record.args.padding_type)); @@ -184,40 +229,43 @@ std::string OpStat::StatByMetric(const Metric metric, return mace::string_util::StringFormatter::Table(title, header, data); } -std::string OpStat::StatByNodeType() const { +std::string OpStat::StatByOpType() const { if (records_.empty()) { return ""; } const int64_t round = total_time_.round(); int64_t total_time = 0; std::map type_time_map; + std::map type_macs_map; std::map type_count_map; std::map type_called_times_map; - std::set node_types_set; + std::set op_types_set; for (auto &record : records_) { - std::string node_type = record.second.type; - node_types_set.insert(node_type); + std::string op_type = record.second.type; + op_types_set.insert(op_type); - type_time_map[node_type] += record.second.rel_end.sum() / round; + type_time_map[op_type] += record.second.rel_end.sum() / round; + type_macs_map[op_type] += record.second.macs; total_time += record.second.rel_end.sum() / round; - type_count_map[node_type] += 1; - 
type_called_times_map[node_type] += record.second.called_times / round; + type_count_map[op_type] += 1; + type_called_times_map[op_type] += record.second.called_times / round; } - std::vector node_types(node_types_set.begin(), - node_types_set.end()); - std::sort(node_types.begin(), node_types.end(), + std::vector op_types(op_types_set.begin(), + op_types_set.end()); + std::sort(op_types.begin(), op_types.end(), [&](const std::string &lhs, const std::string &rhs) { return type_time_map[lhs] > type_time_map[rhs]; }); - std::string title = "Stat by node type"; + std::string title = "Stat by Op Type"; const std::vector header = { - "Node Type", "Count", "Avg(ms)", "%", "cdf%", "Called times" + "Op Type", "Count", "Avg(ms)", "%", "cdf%", "MACs", + "GMACPS", "Called times" }; float cdf = 0.0f; std::vector> data; - for (auto type : node_types) { + for (auto type : op_types) { const float avg_time = type_time_map[type] / 1000.0f; const float percentage = type_time_map[type] * 100.0f / total_time; cdf += percentage; @@ -228,12 +276,43 @@ std::string OpStat::StatByNodeType() const { tuple.push_back(FloatToString(avg_time, 3)); tuple.push_back(FloatToString(percentage, 3)); tuple.push_back(FloatToString(cdf, 3)); + tuple.push_back(IntToString(type_macs_map[type])); + tuple.push_back(FloatToString( + type_macs_map[type] < 1e-6 ? 
type_macs_map[type] : + (type_macs_map[type] * 1e-3) / type_time_map[type], 3)); tuple.push_back(IntToString(type_called_times_map[type])); data.emplace_back(tuple); } return mace::string_util::StringFormatter::Table(title, header, data); } + +std::string OpStat::StatByMACs() const { + if (records_.empty()) { + return ""; + } + const int64_t round = total_time_.round(); + int64_t count = 0; + for (auto &record : records_) { + count += record.second.macs; + } + + std::string title = "Stat by MACs(Multiply-Accumulation)"; + const std::vector header = { + "total", "round", "first(G/s)", "avg(G/s)", "std" + }; + + std::vector> data; + std::vector tuple; + tuple.push_back(IntToString(count)); + tuple.push_back(IntToString(round)); + tuple.push_back(FloatToString((count * 1e-3) / total_time_.first(), 3)); + tuple.push_back(FloatToString((count * 1e-3) / total_time_.avg(), 3)); + tuple.push_back(FloatToString(total_time_.std_deviation(), 3)); + data.emplace_back(tuple); + return mace::string_util::StringFormatter::Table(title, header, data); +} + std::string OpStat::Summary() const { std::stringstream stream; if (!records_.empty()) { @@ -252,9 +331,11 @@ void OpStat::PrintStat() const { stream << StatByMetric(Metric::RUN_ORDER, 0) << std::endl; // top-10 op stat by time stream << StatByMetric(Metric::COMPUTATION_TIME, 10) << std::endl; - // op stat by node type - stream << StatByNodeType() << std::endl; + // op stat by op type + stream << StatByOpType() << std::endl; } + // print MACs statistics + stream << StatByMACs(); // Print summary stream << Summary(); diff --git a/mace/benchmark/statistics.h b/mace/benchmark/statistics.h index 52f963e5..f0cf2be6 100644 --- a/mace/benchmark/statistics.h +++ b/mace/benchmark/statistics.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -33,11 +34,33 @@ class RunMetadata; namespace benchmark { +// stat the number of multiply-accumulate(MAC) +int64_t StatMACs(const std::string &op_type, + const 
std::vector &filter_shape, + const std::vector &output_shape); + template std::string IntToString(const IntType v) { std::stringstream stream; stream << v; - return stream.str(); + std::string src_str = stream.str(); + size_t size = src_str.size(); + size_t dst_size = size + ((size-1) / 3); + if (src_str[0] == '-') { + dst_size = size + ((size-2) / 3); + } + std::string result(dst_size, ','); + size_t dst_idx = dst_size - 1; + for (size_t src_idx = 0; src_idx < size; ++src_idx) { + if ((src_idx % 3) != 0 || src_idx == 0 || dst_idx == 0) { + result[dst_idx] = src_str[size - 1 - src_idx]; + } else { + dst_idx -= 1; + result[dst_idx] = src_str[size - 1 - src_idx]; + } + dst_idx -= 1; + } + return result; } template @@ -127,7 +150,7 @@ enum Metric { COMPUTATION_TIME, }; -class OpStat{ +class OpStat { public: void StatMetadata(const RunMetadata &meta_data); @@ -136,7 +159,8 @@ class OpStat{ private: std::string StatByMetric(const Metric metric, const int top_limit) const; - std::string StatByNodeType() const; + std::string StatByOpType() const; + std::string StatByMACs() const; std::string Summary() const; private: @@ -145,6 +169,7 @@ class OpStat{ std::string type; std::vector> output_shape; ConvPoolArgs args; + int64_t macs; int64_t order; TimeInfo start; TimeInfo rel_end; diff --git a/mace/core/net.cc b/mace/core/net.cc index 1732cfe1..7912a6d4 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -403,8 +403,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { std::string type = op->debug_def().type(); if (type.compare("Conv2D") == 0 || - type.compare("FusedConv2D") == 0 || + type.compare("Deconv2D") == 0 || type.compare("DepthwiseConv2d") == 0 || + type.compare("DepthwiseDeconv2d") == 0 || type.compare("Pooling") == 0) { strides = op->GetRepeatedArgs("strides"); padding_type = op->GetOptionalArg("padding", -1); @@ -415,6 +416,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { } else { kernels = op->Input(1)->shape(); } + } else if 
(type.compare("MatMul") == 0) { + bool transpose_a = op->GetOptionalArg("transpose_a", false); + kernels = op->Input(0)->shape(); + if (transpose_a) { + std::swap(kernels[kernels.size()-2], kernels[kernels.size()-1]); + } + } else if (type.compare("FullyConnected") == 0) { + kernels = op->Input(1)->shape(); } std::vector> output_shapes; diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc index 5da17509..57be33c2 100644 --- a/mace/core/testing/test_benchmark.cc +++ b/mace/core/testing/test_benchmark.cc @@ -28,7 +28,7 @@ namespace testing { static std::vector *all_benchmarks = nullptr; static int64_t bytes_processed; -static int64_t macc_processed; +static int64_t macs_processed = 0; static int64_t accum_time = 0; static int64_t start_time = 0; @@ -62,8 +62,8 @@ void Benchmark::Run(const char *pattern) { // Internal perf regression tools depends on the output formatting, // please keep in consistent when modifying printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)", - "Iterations", "Input(MB/s)", "MACC(G/s)"); - printf("%s\n", std::string(width + 44, '-').c_str()); + "Iterations", "Input(MB/s)", "GMACPS"); + printf("%s\n", std::string(width + 45, '-').c_str()); for (auto b : *all_benchmarks) { if (!std::regex_match(b->name_, match, regex)) continue; int iters; @@ -71,9 +71,9 @@ void Benchmark::Run(const char *pattern) { b->Run(&iters, &seconds); float mbps = (bytes_processed * 1e-6) / seconds; // MACCs or other computations - float gmaccs = (macc_processed * 1e-9) / seconds; + float gmacs = (macs_processed * 1e-9) / seconds; printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, b->name_.c_str(), - seconds * 1e9 / iters, iters, mbps, gmaccs); + seconds * 1e9 / iters, iters, mbps, gmacs); } } @@ -89,7 +89,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) { int64_t iters = kMinIters; while (true) { bytes_processed = -1; - macc_processed = -1; + macs_processed = 0; RestartTiming(); (*benchmark_func_)(iters); 
StopTiming(); @@ -108,7 +108,7 @@ void Benchmark::Run(int *run_count, double *run_seconds) { } void BytesProcessed(int64_t n) { bytes_processed = n; } -void MaccProcessed(int64_t n) { macc_processed = n; } +void MacsProcessed(int64_t n) { macs_processed = n; } void RestartTiming() { accum_time = 0; start_time = NowMicros(); diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h index b6c070c7..2eb91e40 100644 --- a/mace/core/testing/test_benchmark.h +++ b/mace/core/testing/test_benchmark.h @@ -42,7 +42,7 @@ class Benchmark { }; void BytesProcessed(int64_t); -void MaccProcessed(int64_t); +void MacsProcessed(int64_t); void RestartTiming(); void StartTiming(); void StopTiming(); diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 1d8c821d..f6e01a74 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -230,6 +230,7 @@ cc_test( linkstatic = 1, deps = [ "test", + "//mace/benchmark:statistics", "//mace/core:test_benchmark_main", "//third_party/eigen3", ], diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc index 76447e9b..6faf62ce 100644 --- a/mace/ops/activation_benchmark.cc +++ b/mace/ops/activation_benchmark.cc @@ -62,7 +62,6 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) { static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ReluBenchmark(iters, N, C, H, W); \ } \ @@ -119,7 +118,6 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) { static void MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ReluxBenchmark(iters, N, C, H, W); \ } \ @@ -179,7 +177,6 @@ void PreluBenchmark(int iters, int batch, 
int channels, int height, int width) { static void MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ PreluBenchmark(iters, N, C, H, W); \ } \ @@ -235,7 +232,6 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) { static void MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ TanhBenchmark(iters, N, C, H, W); \ } \ @@ -292,7 +288,6 @@ void SigmoidBenchmark( static void MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ SigmoidBenchmark(iters, N, C, H, W); \ } \ diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index f5e11740..b9751557 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -59,7 +59,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * INPUTS * N * H * W * C; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ AddNBenchmark(iters, INPUTS, N, H, W, C); \ } \ diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index d3467e76..a6afcb07 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -75,7 +75,7 @@ void BatchNorm( static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ + 
mace::testing::MacsProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BatchNorm(iters, N, C, H, W); \ } \ diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc index 9664a917..64264936 100644 --- a/mace/ops/batch_to_space_benchmark.cc +++ b/mace/ops/batch_to_space_benchmark.cc @@ -58,7 +58,6 @@ void BMBatchToSpace( MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BMBatchToSpace(iters, N, C, H, W, ARG); \ } \ diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 9026ffb2..f0604d56 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -65,7 +65,6 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BiasAdd(iters, N, C, H, W); \ } \ diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index f5f1df41..4ba0f64c 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -68,7 +68,6 @@ void FilterBufferToImage(int iters, static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * O * I * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ FilterBufferToImage(iters, O, I, H, W); \ } \ diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc index db5f8494..8ea6d139 100644 --- a/mace/ops/channel_shuffle_benchmark.cc +++ b/mace/ops/channel_shuffle_benchmark.cc @@ -61,7 +61,6 @@ void 
ChannelShuffle( MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ChannelShuffle(iters, N, C, H, W, G); \ } \ diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index a43fc308..eaff9b44 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -49,7 +49,6 @@ void ConcatHelper(int iters, int concat_dim, int dim0, int dim1) { net.Run(); } const int64_t tot = static_cast(iters) * dim0 * dim1 * 2; - mace::testing::MaccProcessed(tot); testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { @@ -104,7 +103,6 @@ void OpenCLConcatHelper(int iters, const int64_t tot = static_cast(iters) * (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size()); - mace::testing::MaccProcessed(tot); testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 91efff79..a0e78003 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -14,6 +14,7 @@ #include +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" @@ -154,9 +155,10 @@ void Conv2d(int iters, (H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \ int64_t ow = \ (W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \ - const int64_t macc = \ - static_cast(iters) * N * OC * oh * ow * (KH * KW * C + 1); \ - mace::testing::MaccProcessed(macc); \ + const int64_t macs = \ + static_cast(iters) * mace::benchmark::StatMACs( \ + "Conv2D", {OC, C, KH, KW}, {N, oh, ow, OC}); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Conv2d(iters, N, C, H, W, KH, KW, STRIDE, 
DILATION, \ mace::Padding::P, OC); \ diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc index aad6f93d..5133a28a 100644 --- a/mace/ops/crop_benchmark.cc +++ b/mace/ops/crop_benchmark.cc @@ -44,7 +44,6 @@ void CropHelper(int iters, int crop_axis, int dim1, int offset) { net.RunOp(D); } const int64_t tot = static_cast(iters) * kDim0 * dim1 * dim1; - mace::testing::MaccProcessed(tot); testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { @@ -96,7 +95,6 @@ void OpenCLCropHelper(int iters, const int64_t tot = static_cast(iters) * (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size()); - mace::testing::MaccProcessed(tot); testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index 81be17c0..9a2c405d 100644 --- a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -14,6 +14,7 @@ #include +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" @@ -90,9 +91,10 @@ static void Deconv2d(int iters, const int64_t tot = static_cast(iters) * N * C * H * W; \ int64_t oh = OH; \ int64_t ow = OW; \ - const int64_t macc = \ - static_cast(iters) * N * OC * oh * ow * (KH * KW * C + 1); \ - mace::testing::MaccProcessed(macc); \ + const int64_t macs = \ + static_cast(iters) * mace::benchmark::StatMACs( \ + "Deconv2D", {OC, C, KH, KW}, {N, OH, OW, OC}); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Deconv2d(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \ mace::Padding::P, OC); \ diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc index c9c6dd40..1283e432 100644 --- a/mace/ops/depth_to_space_benchmark.cc +++ b/mace/ops/depth_to_space_benchmark.cc @@ -62,7 +62,6 @@ void DepthToSpace( 
MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ DepthToSpace(iters, N, C, H, W, G); \ } \ diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index 4d44a9bc..c5aee849 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ b/mace/ops/depthwise_conv2d_benchmark.cc @@ -14,6 +14,7 @@ #include +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" @@ -115,9 +116,10 @@ void DepthwiseConv2d(int iters, (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \ int64_t ow = \ (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \ - const int64_t macc = \ - static_cast(iters) * N * C * M * oh * ow * (KH * KW + 1); \ - mace::testing::MaccProcessed(macc); \ + const int64_t macs = \ + static_cast(iters) * mace::benchmark::StatMACs( \ + "DepthwiseConv2d", {M, C, KH, KW}, {N, oh, ow, C}); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ DepthwiseConv2d(iters, N, C, H, W, KH, KW, STRIDE, \ mace::Padding::P, M); \ diff --git a/mace/ops/depthwise_deconv2d_benchmark.cc b/mace/ops/depthwise_deconv2d_benchmark.cc index 081e10d2..a130ca1d 100644 --- a/mace/ops/depthwise_deconv2d_benchmark.cc +++ b/mace/ops/depthwise_deconv2d_benchmark.cc @@ -14,6 +14,7 @@ #include +#include "mace/benchmark/statistics.h" #include "mace/core/operator.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" @@ -81,11 +82,12 @@ static void DepthwiseDeconv2d(int iters, ##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - const int64_t macc = \ - static_cast(iters) * N * H * W * KH * KW * C; \ - mace::testing::MaccProcessed(macc); \ + const 
int64_t macs = \ + static_cast(iters) * mace::benchmark::StatMACs( \ + "DepthwiseDeconv2d", {1, C, KH, KW}, {N, H, W, C}); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - DepthwiseDeconv2d(iters, N, C, H, W, KH, KW, S, P); \ + DepthwiseDeconv2d(iters, N, C, H, W, KH, KW, S, P); \ } \ MACE_BENCHMARK( \ MACE_BM_DEPTHWISE_DECONV2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##S##_##P\ diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 95808bc3..b75149bd 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -66,7 +66,6 @@ void EltwiseBenchmark( MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * H * W * C; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ EltwiseBenchmark( \ iters, static_cast(ELT_TYPE), N, H, W, C); \ diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index bb27c97d..bb6dcd80 100644 --- a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -14,6 +14,7 @@ #include +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" @@ -104,11 +105,12 @@ void FCBenchmark( #define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \ static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \ int iters) { \ - const int64_t macc = \ - static_cast(iters) * N * C * H * W * OC + OC; \ + const int64_t macs = \ + static_cast(iters) * mace::benchmark::StatMACs( \ + "FullyConnected", {OC, H, W, C}, {N, 1, 1, OC}); \ const int64_t tot = \ static_cast(iters) * (N + OC) * C * H * W + OC; \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ FCBenchmark(iters, N, H, W, C, OC); \ } \ diff --git a/mace/ops/gather_benchmark.cc 
b/mace/ops/gather_benchmark.cc index 5e52875c..7fe4a0fb 100644 --- a/mace/ops/gather_benchmark.cc +++ b/mace/ops/gather_benchmark.cc @@ -66,7 +66,6 @@ void GatherBenchmark(int iters, MACE_BM_GATHER##_##N##_##IND##_##VOC##_##EMBED##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * IND * EMBED; \ - mace::testing::MaccProcessed(0); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ GatherBenchmark(iters, N, IND, VOC, EMBED); \ } \ diff --git a/mace/ops/local_response_norm_benchmark.cc b/mace/ops/local_response_norm_benchmark.cc index b917c495..61207af0 100644 --- a/mace/ops/local_response_norm_benchmark.cc +++ b/mace/ops/local_response_norm_benchmark.cc @@ -59,7 +59,6 @@ static void LocalResponseNorm( MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ LocalResponseNorm(iters, N, C, H, W); \ } \ diff --git a/mace/ops/lstmcell_benchmark.cc b/mace/ops/lstmcell_benchmark.cc index 6568025a..a3b96094 100644 --- a/mace/ops/lstmcell_benchmark.cc +++ b/mace/ops/lstmcell_benchmark.cc @@ -79,11 +79,11 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) { static void \ MACE_BM_LSTMCELL_##N##_##INPUT_SIZE##_##HIDDEN_UNITS##_##TYPE##_##DEVICE(\ int iters) { \ - const int64_t macc = \ + const int64_t macs = \ static_cast( \ iters) * N * (INPUT_SIZE + HIDDEN_UNITS) * 4 * HIDDEN_UNITS; \ const int64_t tot = static_cast(iters) * N * INPUT_SIZE; \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \ LSTMCell(iters, N, INPUT_SIZE, HIDDEN_UNITS); \ } \ diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index f118e63f..1996587a 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -19,6 +19,7 @@ #include #include 
"public/gemmlowp.h" +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/gemm.h" #include "mace/ops/sgemm.h" @@ -223,9 +224,10 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) { #define MACE_BM_MATMUL_FUNC(M, K, N, FUNC, TYPE) \ static void MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC(int iters) { \ - const int64_t macc = static_cast(iters) * M * K * N; \ + const int64_t macs = static_cast(iters) * \ + mace::benchmark::StatMACs("MatMul", {K}, {M, N}); \ const int64_t tot = static_cast(iters) * (M + N) * K; \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot * sizeof(TYPE)); \ MatmulBenchmark_##FUNC(iters, M, K, N); \ } \ @@ -377,9 +379,10 @@ void MatMulTransposeBenchmark( #define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \ static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \ int iters) { \ - const int64_t macc = static_cast(iters) * N * C * H * W; \ + const int64_t macs = static_cast(iters) * \ + mace::benchmark::StatMACs("MatMul", {C}, {N, H, W}); \ const int64_t tot = static_cast(iters) * N * (C * H + H * W); \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ MatMulBenchmark(iters, N, H, C, W); \ } \ @@ -392,9 +395,10 @@ void MatMulTransposeBenchmark( #define MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE) \ static void MACE_BM_MATMUL_##T_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \ int iters) { \ - const int64_t macc = static_cast(iters) * N * C * H * W; \ + const int64_t macs = static_cast(iters) * \ + mace::benchmark::StatMACs("MatMul", {C}, {N, H, W}); \ const int64_t tot = static_cast(iters) * N * (C * H + H * W); \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ MatMulTransposeBenchmark(iters, N, H, C, W); \ } \ diff 
--git a/mace/ops/memory_benchmark.cc b/mace/ops/memory_benchmark.cc index e3bb30a8..73f3bdeb 100644 --- a/mace/ops/memory_benchmark.cc +++ b/mace/ops/memory_benchmark.cc @@ -94,7 +94,6 @@ void MemoryAccessBenchmark_NHCW( static void MACE_BM_MEMORY_ACCESS_##N##_##H##_##W##_##C##_##ORDER( \ int iters) { \ const int64_t tot = static_cast(iters) * N * H * W * C; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot * sizeof(float)); \ MemoryAccessBenchmark_##ORDER(iters, N, H, W, C); \ } \ diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index fb7f4e14..0125b4f5 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -57,7 +57,6 @@ void Pad(int iters, int batch, int height, static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Pad(iters, N, H, W, C, PAD); \ } \ diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index c48cc877..880c0cad 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -81,7 +81,6 @@ void Pooling(int iters, ##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Pooling(iters, N, C, H, W, KE, STRIDE, Padding::PA, \ PoolingType::PO); \ diff --git a/mace/ops/quantize_benchmark.cc b/mace/ops/quantize_benchmark.cc index 62a534b7..0c1493b8 100644 --- a/mace/ops/quantize_benchmark.cc +++ b/mace/ops/quantize_benchmark.cc @@ -82,7 +82,6 @@ void Dequantize(int iters, int count) { MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Quantize(iters, N); \ } \ @@ -97,7 +96,6 @@ void Dequantize(int 
iters, int count) { MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Dequantize(iters, N); \ } \ diff --git a/mace/ops/reduce_benchmark.cc b/mace/ops/reduce_benchmark.cc index ec8807b0..663c3b45 100644 --- a/mace/ops/reduce_benchmark.cc +++ b/mace/ops/reduce_benchmark.cc @@ -60,7 +60,6 @@ void Reduce(int iters, int batch, int channels, MACE_BM_REDUCE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Reduce(iters, N, C, H, W); \ } \ diff --git a/mace/ops/resize_bicubic_benchmark.cc b/mace/ops/resize_bicubic_benchmark.cc index 5ababeba..85e073fd 100644 --- a/mace/ops/resize_bicubic_benchmark.cc +++ b/mace/ops/resize_bicubic_benchmark.cc @@ -13,6 +13,8 @@ // limitations under the License. #include + +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" @@ -69,9 +71,10 @@ void ResizeBicubicBenchmark(int iters, MACE_BM_RESIZE_BICUBIC_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ ##DEVICE( \ int iters) { \ - const int64_t macc = static_cast(iters) * N * C * H1 * W1 * 3; \ + const int64_t macs = static_cast(iters) * \ + mace::benchmark::StatMACs("ResizeBicubic", {}, {N, H1, W1, C}); \ const int64_t tot = static_cast(iters) * N * C * H0 * W0; \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ResizeBicubicBenchmark(iters, N, C, H0, W0, H1, W1); \ } \ diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index bace4f10..ddc0f508 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -13,6 +13,8 @@ // limitations under the License. 
#include + +#include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" @@ -75,9 +77,10 @@ void ResizeBilinearBenchmark(int iters, MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\ ##DEVICE( \ int iters) { \ - const int64_t macc = static_cast(iters) * N * C * H1 * W1 * 3; \ + const int64_t macs = static_cast(iters) * \ + mace::benchmark::StatMACs("ResizeBilinear", {}, {N, H1, W1, C}); \ const int64_t tot = static_cast(iters) * N * C * H0 * W0; \ - mace::testing::MaccProcessed(macc); \ + mace::testing::MacsProcessed(macs); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ ResizeBilinearBenchmark(iters, N, C, H0, W0, H1, W1); \ } \ diff --git a/mace/ops/reverse_benchmark.cc b/mace/ops/reverse_benchmark.cc index 9630f696..9b7a915a 100644 --- a/mace/ops/reverse_benchmark.cc +++ b/mace/ops/reverse_benchmark.cc @@ -51,10 +51,9 @@ void Reverse(int iters, int batch, int channels, int height, int width) { #define MACE_BM_REVERSE_MACRO(N, C, H, W, TYPE, DEVICE) \ static void MACE_BM_REVERSE_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ - const int64_t macc = \ + const int64_t macs = \ static_cast(iters) * N * C * H * W; \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(macc); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ Reverse(iters, N, C, H, W); \ } \ diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc index 25095da5..819544b2 100644 --- a/mace/ops/softmax_benchmark.cc +++ b/mace/ops/softmax_benchmark.cc @@ -98,7 +98,6 @@ void SoftmaxBenchmark( static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ SoftmaxBenchmark(iters, N, C, H, W); \ } \ diff --git a/mace/ops/space_to_batch_benchmark.cc 
b/mace/ops/space_to_batch_benchmark.cc index cacadfcd..168461de 100644 --- a/mace/ops/space_to_batch_benchmark.cc +++ b/mace/ops/space_to_batch_benchmark.cc @@ -64,7 +64,6 @@ void BMSpaceToBatch( MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(\ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BMSpaceToBatch(iters, N, H, W, C, SHAPE); \ } \ diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc index 3311d618..6bd7755e 100644 --- a/mace/ops/space_to_depth_benchmark.cc +++ b/mace/ops/space_to_depth_benchmark.cc @@ -62,7 +62,6 @@ void SpaceToDepth( MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ SpaceToDepth(iters, N, C, H, W, G); \ } \ diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc index b21da8f5..020c3214 100644 --- a/mace/ops/split_benchmark.cc +++ b/mace/ops/split_benchmark.cc @@ -65,7 +65,7 @@ void BMSplitHelper(int iters, MACE_BM_SPLIT_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * H * W * C; \ - mace::testing::MaccProcessed(tot); \ + mace::testing::MacsProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ BMSplitHelper(iters, {N, H, W, C}, NO); \ } \ diff --git a/mace/ops/sqrdiff_mean_benchmark.cc b/mace/ops/sqrdiff_mean_benchmark.cc index 353d8e7a..1d2a7aa3 100644 --- a/mace/ops/sqrdiff_mean_benchmark.cc +++ b/mace/ops/sqrdiff_mean_benchmark.cc @@ -63,7 +63,6 @@ void SqrDiffMean(int iters, int batch, int channels, MACE_BM_SQRDIFF_MEAN_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ 
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ SqrDiffMean(iters, N, C, H, W); \ } \ diff --git a/mace/ops/transpose_benchmark.cc b/mace/ops/transpose_benchmark.cc index f584239a..372f2f9d 100644 --- a/mace/ops/transpose_benchmark.cc +++ b/mace/ops/transpose_benchmark.cc @@ -58,7 +58,6 @@ void TransposeBenchmark(int iters, static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ TransposeBenchmark(iters, {H, W}, {1, 0}); \ } \ @@ -72,7 +71,6 @@ void TransposeBenchmark(int iters, MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\ DEVICE(int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ TransposeBenchmark(iters, {N, C, H, W}, {D0, D1, D2, D3}); \ } \ diff --git a/mace/test/BUILD b/mace/test/BUILD index 63faecfe..283dd486 100644 --- a/mace/test/BUILD +++ b/mace/test/BUILD @@ -7,10 +7,10 @@ licenses(["notice"]) # Apache 2.0 load( "//mace:mace.bzl", "if_android", - "if_hexagon_enabled", - "if_not_hexagon_enabled", - "if_openmp_enabled", "if_neon_enabled", + "if_openmp_enabled", + "if_android_armv7", + "if_hexagon_enabled", "if_opencl_enabled", "if_quantize_enabled", ) @@ -32,16 +32,19 @@ cc_test( "-Wextra", "-Wno-missing-field-initializers", ] + if_openmp_enabled([ - "-fopenmp", - "-DMACE_ENABLE_OPENMP", + "-fopenmp" + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", ]) + if_opencl_enabled([ "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", - ]) + if_neon_enabled([ - "-DMACE_ENABLE_NEON", ]), linkopts = ["-fopenmp"], linkstatic = 1, @@ -62,16 +65,19 @@ cc_test( "-Wextra", 
"-Wno-missing-field-initializers", ] + if_openmp_enabled([ - "-fopenmp", - "-DMACE_ENABLE_OPENMP", + "-fopenmp" + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", ]) + if_opencl_enabled([ "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", - ]) + if_neon_enabled([ - "-DMACE_ENABLE_NEON", ]), linkopts = ["-fopenmp"], linkstatic = 1, @@ -92,16 +98,19 @@ cc_test( "-Wextra", "-Wno-missing-field-initializers", ] + if_openmp_enabled([ - "-fopenmp", - "-DMACE_ENABLE_OPENMP", + "-fopenmp" + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", ]) + if_opencl_enabled([ "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", - ]) + if_neon_enabled([ - "-DMACE_ENABLE_NEON", ]), linkopts = ["-fopenmp"], linkstatic = 1, diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index 71f3cc14..96ad13a4 100644 --- a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -50,7 +50,7 @@ def ops_benchmark_stdout_processor(stdout, dev, abi): if len(parts) == 5 and parts[0].startswith("BM_"): metrics["%s.time_ms" % parts[0]] = str(float(parts[1]) / 1e6) metrics["%s.input_mb_per_sec" % parts[0]] = parts[3] - metrics["%s.gmacc_per_sec" % parts[0]] = parts[4] + metrics["%s.gmac_per_sec" % parts[0]] = parts[4] # platform = dev[YAMLKeyword.target_socs] # model = dev[YAMLKeyword.device_name] -- GitLab