From d1d7302c068c3e82e2619996eb2213689761d566 Mon Sep 17 00:00:00 2001
From: Liangliang He
Date: Sat, 24 Feb 2018 11:10:16 +0800
Subject: [PATCH] Add MACC metrics in benchmark

---
 mace/core/testing/test_benchmark.cc      | 32 ++++------
 mace/core/testing/test_benchmark.h       |  2 +-
 mace/examples/benchmark_example.cc       |  4 +-
 mace/ops/activation_benchmark.cc         | 78 +++++++++++++-----------
 mace/ops/addn_benchmark.cc               | 20 +++---
 mace/ops/batch_norm_benchmark.cc         |  3 +-
 mace/ops/batch_to_space_benchmark.cc     |  2 +-
 mace/ops/bias_add_benchmark.cc           |  2 +-
 mace/ops/channel_shuffle_benchmark.cc    |  2 +-
 mace/ops/concat_benchmark.cc             |  4 +-
 mace/ops/conv_2d_benchmark.cc            | 14 ++++-
 mace/ops/depthwise_conv2d_benchmark.cc   | 46 ++++++++------
 mace/ops/eltwise_benchmark.cc            |  2 +-
 mace/ops/global_avg_pooling_benchmark.cc |  2 +-
 mace/ops/matmul_benchmark.cc             | 25 ++++----
 mace/ops/pooling_benchmark.cc            |  2 +-
 mace/ops/resize_bilinear_benchmark.cc    |  5 +-
 mace/ops/softmax_benchmark.cc            |  2 +-
 mace/ops/space_to_batch_benchmark.cc     |  2 +-
 mace/ops/winograd_transform_benchmark.cc |  4 +-
 20 files changed, 137 insertions(+), 116 deletions(-)

diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc
index 513ec349..e458516f 100644
--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -9,9 +9,9 @@
 #include
 #include
 
+#include "mace/core/testing/test_benchmark.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"
-#include "mace/core/testing/test_benchmark.h"
 
 namespace mace {
 namespace testing {
@@ -19,7 +19,7 @@ namespace testing {
 static std::vector<Benchmark *> *all_benchmarks = nullptr;
 static std::string label;
 static int64_t bytes_processed;
-static int64_t items_processed;
+static int64_t macc_processed;
 static int64_t accum_time = 0;
 static int64_t start_time = 0;
 
@@ -81,8 +81,9 @@ void Benchmark::Run(const char *pattern) {
     }
   }
 
-  printf("%-*s %10s %10s\n", width, "Benchmark", "Time(ns)", "Iterations");
-  printf("%s\n", std::string(width + 22, '-').c_str());
+  printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)",
+         "Iterations", "Input(MB/s)", "MACC(G/s)");
+  printf("%s\n", std::string(width + 44, '-').c_str());
   for (auto b : *all_benchmarks) {
     if (!std::regex_match(b->name_, match, regex)) continue;
     for (auto arg : b->args_) {
@@ -98,20 +99,11 @@ void Benchmark::Run(const char *pattern) {
       double seconds;
       b->Run(arg.first, arg.second, &iters, &seconds);
 
-      char buf[100];
-      std::string full_label = label;
-      if (bytes_processed > 0) {
-        snprintf(buf, sizeof(buf), " %.1fMB/s",
-                 (bytes_processed * 1e-6) / seconds);
-        full_label += buf;
-      }
-      if (items_processed > 0) {
-        snprintf(buf, sizeof(buf), " %.1fM items/s",
-                 (items_processed * 1e-6) / seconds);
-        full_label += buf;
-      }
-      printf("%-*s %10.0f %10d\t%s\n", width, name, seconds * 1e9 / iters,
-             iters, full_label.c_str());
+      float mbps = (bytes_processed * 1e-6) / seconds;
+      // MACCs or other computations
+      float gmaccs = (macc_processed * 1e-9) / seconds;
+      printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, name,
+             seconds * 1e9 / iters, iters, mbps, gmaccs);
     }
   }
 }
@@ -130,7 +122,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
   accum_time = 0;
   start_time = utils::NowMicros();
   bytes_processed = -1;
-  items_processed = -1;
+  macc_processed = -1;
   label.clear();
   if (fn0_) {
     (*fn0_)(iters);
@@ -158,7 +150,7 @@
 }
 
 void BytesProcessed(int64_t n) { bytes_processed = n; }
-void ItemsProcessed(int64_t n) { items_processed = n; }
+void MaccProcessed(int64_t n) { macc_processed = n; }
 
 void StartTiming() {
   if (start_time == 0) start_time = utils::NowMicros();
 }
diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h
index 6d40ff75..7ecd3ea8 100644
--- a/mace/core/testing/test_benchmark.h
+++ b/mace/core/testing/test_benchmark.h
@@ -43,7 +43,7 @@ class Benchmark {
 
 void RunBenchmarks();
 void BytesProcessed(int64_t);
-void ItemsProcessed(int64_t);
+void MaccProcessed(int64_t);
 void StartTiming();
 void StopTiming();
 
diff --git a/mace/examples/benchmark_example.cc b/mace/examples/benchmark_example.cc
index 93d1bd1a..f19cf2e7 100644
--- a/mace/examples/benchmark_example.cc
+++ b/mace/examples/benchmark_example.cc
@@ -7,7 +7,7 @@
 static void foo(int iters) {
   static const int N = 32;
   const int64_t tot = static_cast<int64_t>(iters) * N;
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
   float *inp = new float[N];
@@ -26,7 +26,7 @@ BENCHMARK(foo);
 
 static void bar(int iters, int n) {
   const int64_t tot = static_cast<int64_t>(iters) * n;
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
   float *inp = new float[n];
diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc
index 8a26e243..1037bdcb 100644
--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -51,21 +51,22 @@ static void ReluBenchmark(
 #define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                          \
   BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_RELU(N, C, H, W, TYPE)         \
-  BM_RELU_MACRO(N, C, H, W, TYPE, CPU);   \
-  BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_RELU(N, C, H, W)                 \
+  BM_RELU_MACRO(N, C, H, W, float, CPU);    \
+  BM_RELU_MACRO(N, C, H, W, float, OPENCL); \
+  BM_RELU_MACRO(N, C, H, W, half, OPENCL);
 
-BM_RELU(1, 1, 512, 512, float);
-BM_RELU(1, 3, 128, 128, float);
-BM_RELU(1, 3, 512, 512, float);
-BM_RELU(1, 32, 112, 112, float);
-BM_RELU(1, 64, 256, 256, float);
+BM_RELU(1, 1, 512, 512);
+BM_RELU(1, 3, 128, 128);
+BM_RELU(1, 3, 512, 512);
+BM_RELU(1, 32, 112, 112);
+BM_RELU(1, 64, 256, 256);
 
 template <DeviceType D, typename T>
 static void ReluxBenchmark(
@@ -112,21 +113,22 @@ static void ReluxBenchmark(
 #define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::ItemsProcessed(tot);                                       \
+    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                           \
   BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_RELUX(N, C, H, W, TYPE)         \
-  BM_RELUX_MACRO(N, C, H, W, TYPE, CPU);   \
-  BM_RELUX_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_RELUX(N, C, H, W)                 \
+  BM_RELUX_MACRO(N, C, H, W, float, CPU);    \
+  BM_RELUX_MACRO(N, C, H, W, float, OPENCL); \
+  BM_RELUX_MACRO(N, C, H, W, half, OPENCL);
 
-BM_RELUX(1, 1, 512, 512, float);
-BM_RELUX(1, 3, 128, 128, float);
-BM_RELUX(1, 3, 512, 512, float);
-BM_RELUX(1, 32, 112, 112, float);
-BM_RELUX(1, 64, 256, 256, float);
+BM_RELUX(1, 1, 512, 512);
+BM_RELUX(1, 3, 128, 128);
+BM_RELUX(1, 3, 512, 512);
+BM_RELUX(1, 32, 112, 112);
+BM_RELUX(1, 64, 256, 256);
 
 template <DeviceType D, typename T>
 static void PreluBenchmark(
@@ -173,21 +175,22 @@ static void PreluBenchmark(
 #define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::ItemsProcessed(tot);                                       \
+    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                           \
   BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_PRELU(N, C, H, W, TYPE)         \
-  BM_PRELU_MACRO(N, C, H, W, TYPE, CPU);   \
-  BM_PRELU_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_PRELU(N, C, H, W)                 \
+  BM_PRELU_MACRO(N, C, H, W, float, CPU);    \
+  BM_PRELU_MACRO(N, C, H, W, float, OPENCL); \
+  BM_PRELU_MACRO(N, C, H, W, half, OPENCL);
 
-BM_PRELU(1, 1, 512, 512, float);
-BM_PRELU(1, 3, 128, 128, float);
-BM_PRELU(1, 3, 512, 512, float);
-BM_PRELU(1, 32, 112, 112, float);
-BM_PRELU(1, 64, 256, 256, float);
+BM_PRELU(1, 1, 512, 512);
+BM_PRELU(1, 3, 128, 128);
+BM_PRELU(1, 3, 512, 512);
+BM_PRELU(1, 32, 112, 112);
+BM_PRELU(1, 64, 256, 256);
 
 template <DeviceType D, typename T>
 static void TanhBenchmark(
@@ -232,21 +235,22 @@ static void TanhBenchmark(
 #define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                          \
   BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_TANH(N, C, H, W, TYPE)         \
-  BM_TANH_MACRO(N, C, H, W, TYPE, CPU);   \
-  BM_TANH_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_TANH(N, C, H, W)                 \
+  BM_TANH_MACRO(N, C, H, W, float, CPU);    \
+  BM_TANH_MACRO(N, C, H, W, float, OPENCL); \
+  BM_TANH_MACRO(N, C, H, W, half, OPENCL);
 
-BM_TANH(1, 1, 512, 512, float);
-BM_TANH(1, 3, 128, 128, float);
-BM_TANH(1, 3, 512, 512, float);
-BM_TANH(1, 32, 112, 112, float);
-BM_TANH(1, 64, 256, 256, float);
+BM_TANH(1, 1, 512, 512);
+BM_TANH(1, 3, 128, 128);
+BM_TANH(1, 3, 512, 512);
+BM_TANH(1, 32, 112, 112);
+BM_TANH(1, 64, 256, 256);
 
 template <DeviceType D, typename T>
 static void SigmoidBenchmark(
@@ -292,7 +296,7 @@ static void SigmoidBenchmark(
   static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(          \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                       \
   }                                                                          \
diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc
index bd56e676..80544737 100644
--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -55,18 +55,18 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
   }
 }
 
-#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                      \
-  static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(  \
-      int iters) {                                                           \
-    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;         \
-    mace::testing::ItemsProcessed(tot);                                      \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
-    AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C);                  \
-  }                                                                          \
+#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                       \
+  static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(   \
+      int iters) {                                                            \
+    const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
+    mace::testing::MaccProcessed(tot);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C);                   \
+  }                                                                           \
   BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
 
-#define BM_ADDN(INPUTS, N, H, W, C)                 \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU);    \
+#define BM_ADDN(INPUTS, N, H, W, C)                  \
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU);     \
   BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \
   BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL);
 
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index ab2fa610..b0975aa3 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -76,7 +76,7 @@ static void BatchNorm(
   static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(       \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BatchNorm<DEVICE, TYPE>(iters, N, C, H, W);                              \
   }                                                                          \
@@ -84,7 +84,6 @@
 
 #define BM_BATCH_NORM(N, C, H, W)                 \
   BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU);    \
-  BM_BATCH_NORM_MACRO(N, C, H, W, float, NEON);   \
   BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \
   BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL);
 
diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc
index 46363f86..bac02236 100644
--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -41,7 +41,7 @@ static void BMBatchToSpace(
   BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(       \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG);                    \
   }                                                                          \
diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc
index 7d091fd9..d59885de 100644
--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
   static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(         \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BiasAdd<DEVICE, TYPE>(iters, N, C, H, W);                                \
   }                                                                          \
diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc
index ca75ce10..a984b39d 100644
--- a/mace/ops/channel_shuffle_benchmark.cc
+++ b/mace/ops/channel_shuffle_benchmark.cc
@@ -41,7 +41,7 @@ static void ChannelShuffle(
   static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE(     \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(float)));                     \
     ChannelShuffle(iters, N, C, H, W, G);                                    \
   }                                                                          \
diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc
index 11d7de4b..82f66c56 100644
--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -29,7 +29,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
     net.RunOp(D);
   }
   const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * 2;
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
@@ -80,7 +80,7 @@ static void OpenclConcatHelper(int iters,
   const int64_t tot =
       static_cast<int64_t>(iters) *
       (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 63c9df80..c7ba5a9d 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -83,8 +83,20 @@ static void Conv2d(int iters,
   static void                                                                \
   BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
       int iters) {                                                           \
+    const int64_t dilation = 1;                                              \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    int64_t pad_h = 0, pad_w = 0;                                            \
+    if (P == SAME) {                                                         \
+      pad_h = KH / 2;                                                        \
+      pad_w = KW / 2;                                                        \
+    }                                                                        \
+    int64_t oh =                                                             \
+        (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1;       \
+    int64_t ow =                                                             \
+        (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1;       \
+    const int64_t macc =                                                     \
+        static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1);  \
+    mace::testing::MaccProcessed(macc);                                      \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
                          OC);                                                \
diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc
index 2f58343a..6ba3b000 100644
--- a/mace/ops/depthwise_conv2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -75,24 +75,36 @@ static void DepthwiseConv2d(int iters,
   }
 }
 
-#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
-                                   DEVICE)                                  \
-  static void                                                               \
-  BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
-      int iters) {                                                          \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;        \
-    mace::testing::ItemsProcessed(tot);                                     \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
-    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,        \
-                                  mace::Padding::P, OC);                    \
-  }                                                                         \
-  BENCHMARK(                                                                \
-      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
+#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE,   \
+                                   DEVICE)                                   \
+  static void                                                                \
+  BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE( \
+      int iters) {                                                           \
+    const int64_t dilation = 1;                                              \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
+    int64_t pad_h = 0, pad_w = 0;                                            \
+    if (P == SAME) {                                                         \
+      pad_h = KH / 2;                                                        \
+      pad_w = KW / 2;                                                        \
+    }                                                                        \
+    int64_t oh =                                                             \
+        (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1;       \
+    int64_t ow =                                                             \
+        (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1;       \
+    const int64_t macc =                                                     \
+        static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1);   \
+    mace::testing::MaccProcessed(macc);                                      \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
+    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,         \
+                                  mace::Padding::P, M);                      \
+  }                                                                          \
+  BENCHMARK(                                                                 \
+      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE)
 
-#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC)                 \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU);    \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
+#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M)                 \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU);    \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL);
 
 BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
 BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1);
diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc
index 2c3fcad4..01838562 100644
--- a/mace/ops/eltwise_benchmark.cc
+++ b/mace/ops/eltwise_benchmark.cc
@@ -61,7 +61,7 @@ static void EltwiseBenchmark(
   BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(         \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     EltwiseBenchmark<DEVICE, TYPE>(                                          \
         iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C);     \
diff --git a/mace/ops/global_avg_pooling_benchmark.cc b/mace/ops/global_avg_pooling_benchmark.cc
index f4decad2..00b5471a 100644
--- a/mace/ops/global_avg_pooling_benchmark.cc
+++ b/mace/ops/global_avg_pooling_benchmark.cc
@@ -40,7 +40,7 @@ static void GlobalAvgPooling(
   static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE(        \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(float)));                     \
     GlobalAvgPooling(iters, N, C, H, W);                                     \
   }                                                                          \
diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc
index 1850086d..f6e1c6d1 100644
--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -20,10 +20,8 @@ static void MatMulBenchmark(
   net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, T>(net, "A", "AImage",
-                        kernels::BufferType::IN_OUT_WIDTH);
-    BufferToImage<D, T>(net, "B", "BImage",
-                        kernels::BufferType::IN_OUT_HEIGHT);
+    BufferToImage<D, T>(net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
+    BufferToImage<D, T>(net, "B", "BImage", kernels::BufferType::IN_OUT_HEIGHT);
 
     OpDefBuilder("MatMul", "MatMulBM")
         .Input("AImage")
@@ -52,16 +50,19 @@ static void MatMulBenchmark(
   net.Sync();
 }
 
-#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                             \
-  static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::ItemsProcessed(tot);                                       \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
-    MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                         \
-  }                                                                           \
+#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                              \
+  static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
+    const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W;          \
+    const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W);     \
+    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
+    MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                          \
+  }                                                                            \
   BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
 
-#define BM_MATMUL(N, H, C, W) \
+#define BM_MATMUL(N, H, C, W)                 \
+  BM_MATMUL_MACRO(N, H, C, W, float, CPU);    \
+  BM_MATMUL_MACRO(N, H, C, W, float, OPENCL); \
   BM_MATMUL_MACRO(N, H, C, W, half, OPENCL);
 
 BM_MATMUL(16, 32, 128, 49);
diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc
index 1a4d1925..fd673d42 100644
--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -54,7 +54,7 @@ static void Pooling(int iters,
   BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE( \
      int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(float)));                     \
     Pooling(iters, N, C, H, W, KE, STRIDE, Padding::PA,                      \
             PoolingType::PO);                                                \
diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc
index f582ede7..9daee298 100644
--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -61,8 +61,9 @@ static void ResizeBilinearBenchmark(int iters,
   static void                                                                \
   BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_##DEVICE( \
       int iters) {                                                           \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H1 * W1;       \
-    mace::testing::ItemsProcessed(tot);                                      \
+    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3;  \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0;       \
+    mace::testing::MaccProcessed(macc);                                      \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1);      \
   }                                                                          \
diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc
index 5e8a283d..f2ccaf5b 100644
--- a/mace/ops/softmax_benchmark.cc
+++ b/mace/ops/softmax_benchmark.cc
@@ -49,7 +49,7 @@ static void SoftmaxBenchmark(
 #define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;            \
-    mace::testing::ItemsProcessed(tot);                                         \
+    mace::testing::MaccProcessed(tot);                                          \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                         \
     SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                             \
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index 86ba5808..ac643f94 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -42,7 +42,7 @@ static void BMSpaceToBatch(
   BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(     \
      int iters) {                                                            \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE);                  \
   }                                                                          \
diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc
index a7b99257..a8c0e77b 100644
--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -41,7 +41,7 @@ static void BMWinogradTransform(
   BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(           \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C);                    \
   }                                                                          \
@@ -93,7 +93,7 @@ static void BMWinogradInverseTransform(
   BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(   \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C);             \
   }                                                                          \
-- 
GitLab
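
Reviewer note (not part of the patch): the MACC figure that BM_CONV_2D_MACRO above feeds into MaccProcessed can be written as a plain helper for checking the numbers by hand. The sketch below is illustrative only; it assumes the same padding resolution the macro uses (SAME -> kernel/2, otherwise 0) and, like the macro, counts the bias add as one extra multiply-accumulate per output element. The function name and signature are hypothetical.

#include <cstdint>

// Mirrors the arithmetic in BM_CONV_2D_MACRO: output size from stride,
// dilation and padding, then KH * KW * C MACCs per output element plus bias.
int64_t Conv2dMacc(int64_t n, int64_t c, int64_t h, int64_t w,
                   int64_t kh, int64_t kw, int64_t stride, bool same_padding,
                   int64_t oc, int64_t dilation = 1) {
  const int64_t pad_h = same_padding ? kh / 2 : 0;
  const int64_t pad_w = same_padding ? kw / 2 : 0;
  const int64_t oh =
      (h + 2 * pad_h - kh - (kh - 1) * (dilation - 1)) / stride + 1;
  const int64_t ow =
      (w + 2 * pad_w - kw - (kw - 1) * (dilation - 1)) / stride + 1;
  return n * oc * oh * ow * (kh * kw * c + 1);
}

Dividing this count by the measured seconds per iteration gives the MACC(G/s) column that the new test_benchmark.cc printf reports (after the 1e-9 scaling).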