From d1d7302c068c3e82e2619996eb2213689761d566 Mon Sep 17 00:00:00 2001
From: Liangliang He
Date: Sat, 24 Feb 2018 11:10:16 +0800
Subject: [PATCH] Add MACC metrics in benchmark

---
 mace/core/testing/test_benchmark.cc      | 32 ++++------
 mace/core/testing/test_benchmark.h       |  2 +-
 mace/examples/benchmark_example.cc       |  4 +-
 mace/ops/activation_benchmark.cc         | 78 +++++++++++++-----------
 mace/ops/addn_benchmark.cc               | 20 +++---
 mace/ops/batch_norm_benchmark.cc         |  3 +-
 mace/ops/batch_to_space_benchmark.cc     |  2 +-
 mace/ops/bias_add_benchmark.cc           |  2 +-
 mace/ops/channel_shuffle_benchmark.cc    |  2 +-
 mace/ops/concat_benchmark.cc             |  4 +-
 mace/ops/conv_2d_benchmark.cc            | 14 ++++-
 mace/ops/depthwise_conv2d_benchmark.cc   | 46 ++++++++------
 mace/ops/eltwise_benchmark.cc            |  2 +-
 mace/ops/global_avg_pooling_benchmark.cc |  2 +-
 mace/ops/matmul_benchmark.cc             | 25 ++++----
 mace/ops/pooling_benchmark.cc            |  2 +-
 mace/ops/resize_bilinear_benchmark.cc    |  5 +-
 mace/ops/softmax_benchmark.cc            |  2 +-
 mace/ops/space_to_batch_benchmark.cc     |  2 +-
 mace/ops/winograd_transform_benchmark.cc |  4 +-
 20 files changed, 137 insertions(+), 116 deletions(-)

diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc
index 513ec349..e458516f 100644
--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -9,9 +9,9 @@
 #include
 #include
 
+#include "mace/core/testing/test_benchmark.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"
-#include "mace/core/testing/test_benchmark.h"
 
 namespace mace {
 namespace testing {
@@ -19,7 +19,7 @@ namespace testing {
 static std::vector<Benchmark *> *all_benchmarks = nullptr;
 static std::string label;
 static int64_t bytes_processed;
-static int64_t items_processed;
+static int64_t macc_processed;
 static int64_t accum_time = 0;
 static int64_t start_time = 0;
 
@@ -81,8 +81,9 @@ void Benchmark::Run(const char *pattern) {
     }
   }
 
-  printf("%-*s %10s %10s\n", width, "Benchmark", "Time(ns)", "Iterations");
-  printf("%s\n", std::string(width + 22, '-').c_str());
+  printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)",
+         "Iterations", "Input(MB/s)", "MACC(G/s)");
+  printf("%s\n", std::string(width + 44, '-').c_str());
   for (auto b : *all_benchmarks) {
     if (!std::regex_match(b->name_, match, regex)) continue;
     for (auto arg : b->args_) {
@@ -98,20 +99,11 @@ void Benchmark::Run(const char *pattern) {
       double seconds;
       b->Run(arg.first, arg.second, &iters, &seconds);
 
-      char buf[100];
-      std::string full_label = label;
-      if (bytes_processed > 0) {
-        snprintf(buf, sizeof(buf), " %.1fMB/s",
-                 (bytes_processed * 1e-6) / seconds);
-        full_label += buf;
-      }
-      if (items_processed > 0) {
-        snprintf(buf, sizeof(buf), " %.1fM items/s",
-                 (items_processed * 1e-6) / seconds);
-        full_label += buf;
-      }
-      printf("%-*s %10.0f %10d\t%s\n", width, name, seconds * 1e9 / iters,
-             iters, full_label.c_str());
+      float mbps = (bytes_processed * 1e-6) / seconds;
+      // MACCs or other computations
+      float gmaccs = (macc_processed * 1e-9) / seconds;
+      printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, name,
+             seconds * 1e9 / iters, iters, mbps, gmaccs);
     }
   }
 }
@@ -130,7 +122,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
   accum_time = 0;
   start_time = utils::NowMicros();
   bytes_processed = -1;
-  items_processed = -1;
+  macc_processed = -1;
   label.clear();
   if (fn0_) {
     (*fn0_)(iters);
@@ -158,7 +150,7 @@
 }
 
 void BytesProcessed(int64_t n) { bytes_processed = n; }
-void ItemsProcessed(int64_t n) { items_processed = n; }
+void MaccProcessed(int64_t n) { macc_processed = n; }
 
 void StartTiming() {
   if (start_time == 0) start_time = utils::NowMicros();
 }
diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h
index 6d40ff75..7ecd3ea8 100644
--- a/mace/core/testing/test_benchmark.h
+++ b/mace/core/testing/test_benchmark.h
@@ -43,7 +43,7 @@ class Benchmark {
 
 void RunBenchmarks();
 void BytesProcessed(int64_t);
-void ItemsProcessed(int64_t);
+void MaccProcessed(int64_t);
 void StartTiming();
 void StopTiming();
 
diff --git a/mace/examples/benchmark_example.cc b/mace/examples/benchmark_example.cc
index 93d1bd1a..f19cf2e7 100644
--- a/mace/examples/benchmark_example.cc
+++ b/mace/examples/benchmark_example.cc
@@ -7,7 +7,7 @@
 static void foo(int iters) {
   static const int N = 32;
   const int64_t tot = static_cast<int64_t>(iters) * N;
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
   float *inp = new float[N];
@@ -26,7 +26,7 @@ BENCHMARK(foo);
 
 static void bar(int iters, int n) {
   const int64_t tot = static_cast<int64_t>(iters) * n;
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
   float *inp = new float[n];
diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc
index 8a26e243..1037bdcb 100644
--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -51,21 +51,22 @@ static void ReluBenchmark(
 #define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                          \
   BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_RELU(N, C, H, W, TYPE)         \
-  BM_RELU_MACRO(N, C, H, W, TYPE, CPU);   \
-  BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_RELU(N, C, H, W)                 \
+  BM_RELU_MACRO(N, C, H, W, float, CPU);    \
+  BM_RELU_MACRO(N, C, H, W, float, OPENCL); \
+  BM_RELU_MACRO(N, C, H, W, half, OPENCL);
 
-BM_RELU(1, 1, 512, 512, float);
-BM_RELU(1, 3, 128, 128, float);
-BM_RELU(1, 3, 512, 512, float);
-BM_RELU(1, 32, 112, 112, float);
-BM_RELU(1, 64, 256, 256, float);
+BM_RELU(1, 1, 512, 512);
+BM_RELU(1, 3, 128, 128);
+BM_RELU(1, 3, 512, 512);
+BM_RELU(1, 32, 112, 112);
+BM_RELU(1, 64, 256, 256);
 
 template <DeviceType D, typename T>
 static void ReluxBenchmark(
@@ -112,21 +113,22 @@ static void ReluxBenchmark(
 #define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::ItemsProcessed(tot);                                       \
+    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                           \
   BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_RELUX(N, C, H, W, TYPE)         \
-  BM_RELUX_MACRO(N, C, H, W, TYPE, CPU);   \
-  BM_RELUX_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_RELUX(N, C, H, W)                 \
+  BM_RELUX_MACRO(N, C, H, W, float, CPU);    \
+  BM_RELUX_MACRO(N, C, H, W, float, OPENCL); \
+  BM_RELUX_MACRO(N, C, H, W, half, OPENCL);
 
-BM_RELUX(1, 1, 512, 512, float);
-BM_RELUX(1, 3, 128, 128, float);
-BM_RELUX(1, 3, 512, 512, float);
-BM_RELUX(1, 32, 112, 112, float);
-BM_RELUX(1, 64, 256, 256, float);
+BM_RELUX(1, 1, 512, 512);
+BM_RELUX(1, 3, 128, 128);
+BM_RELUX(1, 3, 512, 512);
+BM_RELUX(1, 32, 112, 112);
+BM_RELUX(1, 64, 256, 256);
 
 template <DeviceType D, typename T>
 static void PreluBenchmark(
@@ -173,21 +175,22 @@ static void PreluBenchmark(
 #define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::ItemsProcessed(tot);                                       \
+    mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                           \
   BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_PRELU(N, C, H, W, TYPE)         \
-  BM_PRELU_MACRO(N, C, H, W, TYPE, CPU);   \
-  BM_PRELU_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_PRELU(N, C, H, W)                 \
+  BM_PRELU_MACRO(N, C, H, W, float, CPU);    \
+  BM_PRELU_MACRO(N, C, H, W, float, OPENCL); \
+  BM_PRELU_MACRO(N, C, H, W, half, OPENCL);
 
-BM_PRELU(1, 1, 512, 512, float);
-BM_PRELU(1, 3, 128, 128, float);
-BM_PRELU(1, 3, 512, 512, float);
-BM_PRELU(1, 32, 112, 112, float);
-BM_PRELU(1, 64, 256, 256, float);
+BM_PRELU(1, 1, 512, 512);
+BM_PRELU(1, 3, 128, 128);
+BM_PRELU(1, 3, 512, 512);
+BM_PRELU(1, 32, 112, 112);
+BM_PRELU(1, 64, 256, 256);
 
 template <DeviceType D, typename T>
 static void TanhBenchmark(
@@ -232,21 +235,22 @@ static void TanhBenchmark(
 #define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                          \
   BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_TANH(N, C, H, W, TYPE)         \
-  BM_TANH_MACRO(N, C, H, W, TYPE, CPU);   \
-  BM_TANH_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_TANH(N, C, H, W)                 \
+  BM_TANH_MACRO(N, C, H, W, float, CPU);    \
+  BM_TANH_MACRO(N, C, H, W, float, OPENCL); \
+  BM_TANH_MACRO(N, C, H, W, half, OPENCL);
 
-BM_TANH(1, 1, 512, 512, float);
-BM_TANH(1, 3, 128, 128, float);
-BM_TANH(1, 3, 512, 512, float);
-BM_TANH(1, 32, 112, 112, float);
-BM_TANH(1, 64, 256, 256, float);
+BM_TANH(1, 1, 512, 512);
+BM_TANH(1, 3, 128, 128);
+BM_TANH(1, 3, 512, 512);
+BM_TANH(1, 32, 112, 112);
+BM_TANH(1, 64, 256, 256);
 
 template <DeviceType D, typename T>
 static void SigmoidBenchmark(
@@ -292,7 +296,7 @@ static void SigmoidBenchmark(
   static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(          \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                       \
   }                                                                          \
diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc
index bd56e676..80544737 100644
--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -55,18 +55,18 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
   }
 }
 
-#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                      \
-  static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(  \
-      int iters) {                                                           \
-    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;         \
-    mace::testing::ItemsProcessed(tot);                                      \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
-    AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C);                  \
-  }                                                                          \
+#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                       \
+  static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(   \
+      int iters) {                                                            \
+    const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
+    mace::testing::MaccProcessed(tot);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C);                   \
+  }                                                                           \
   BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
 
-#define BM_ADDN(INPUTS, N, H, W, C)                 \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU);    \
+#define BM_ADDN(INPUTS, N, H, W, C)                  \
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU);     \
   BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \
   BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL);
 
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index ab2fa610..b0975aa3 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -76,7 +76,7 @@ static void BatchNorm(
   static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(       \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BatchNorm<DEVICE, TYPE>(iters, N, C, H, W);                              \
   }                                                                          \
@@ -84,7 +84,6 @@
 
 #define BM_BATCH_NORM(N, C, H, W)                 \
   BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU);    \
-  BM_BATCH_NORM_MACRO(N, C, H, W, float, NEON);   \
   BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \
   BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL);
 
diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc
index 46363f86..bac02236 100644
--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -41,7 +41,7 @@ static void BMBatchToSpace(
   BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(       \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG);                    \
   }                                                                          \
diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc
index 7d091fd9..d59885de 100644
--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
   static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(         \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BiasAdd<DEVICE, TYPE>(iters, N, C, H, W);                                \
   }                                                                          \
diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc
index ca75ce10..a984b39d 100644
--- a/mace/ops/channel_shuffle_benchmark.cc
+++ b/mace/ops/channel_shuffle_benchmark.cc
@@ -41,7 +41,7 @@ static void ChannelShuffle(
   static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE(     \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(float)));                     \
     ChannelShuffle(iters, N, C, H, W, G);                                    \
   }                                                                          \
diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc
index 11d7de4b..82f66c56 100644
--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -29,7 +29,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
     net.RunOp(D);
   }
   const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * 2;
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
@@ -80,7 +80,7 @@ static void OpenclConcatHelper(int iters,
   const int64_t tot =
       static_cast<int64_t>(iters) *
       (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
   testing::BytesProcessed(tot * sizeof(T));
   mace::testing::StartTiming();
   while (iters--) {
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 63c9df80..c7ba5a9d 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -83,8 +83,20 @@ static void Conv2d(int iters,
   static void                                                                \
   BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
       int iters) {                                                           \
+    const int64_t dilation = 1;                                              \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    int64_t pad_h = 0, pad_w = 0;                                            \
+    if (P == SAME) {                                                         \
+      pad_h = KH / 2;                                                        \
+      pad_w = KW / 2;                                                        \
+    }                                                                        \
+    int64_t oh =                                                             \
+        (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1;       \
+    int64_t ow =                                                             \
+        (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1;       \
+    const int64_t macc =                                                     \
+        static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1);  \
+    mace::testing::MaccProcessed(macc);                                      \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
                          OC);                                                \
diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc
index 2f58343a..6ba3b000 100644
--- a/mace/ops/depthwise_conv2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -75,24 +75,36 @@ static void DepthwiseConv2d(int iters,
   }
 }
 
-#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
-                                   DEVICE)                                  \
-  static void                                                               \
-  BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
-      int iters) {                                                          \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;        \
-    mace::testing::ItemsProcessed(tot);                                     \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
-    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,        \
-                                  mace::Padding::P, OC);                    \
-  }                                                                         \
-  BENCHMARK(                                                                \
-      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
+#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE,   \
+                                   DEVICE)                                   \
+  static void                                                                \
+  BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE( \
+      int iters) {                                                           \
+    const int64_t dilation = 1;                                              \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
+    int64_t pad_h = 0, pad_w = 0;                                            \
+    if (P == SAME) {                                                         \
+      pad_h = KH / 2;                                                        \
+      pad_w = KW / 2;                                                        \
+    }                                                                        \
+    int64_t oh =                                                             \
+        (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1;       \
+    int64_t ow =                                                             \
+        (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1;       \
+    const int64_t macc =                                                     \
+        static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1);   \
+    mace::testing::MaccProcessed(macc);                                      \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
+    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,         \
+                                  mace::Padding::P, M);                      \
+  }                                                                          \
+  BENCHMARK(                                                                 \
+      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE)
 
-#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC)                 \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU);    \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
+#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M)                 \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU);    \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL);
 
 BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
 BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1);
diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc
index 2c3fcad4..01838562 100644
--- a/mace/ops/eltwise_benchmark.cc
+++ b/mace/ops/eltwise_benchmark.cc
@@ -61,7 +61,7 @@ static void EltwiseBenchmark(
   BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(         \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     EltwiseBenchmark<DEVICE, TYPE>(                                          \
         iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C);     \
diff --git a/mace/ops/global_avg_pooling_benchmark.cc b/mace/ops/global_avg_pooling_benchmark.cc
index f4decad2..00b5471a 100644
--- a/mace/ops/global_avg_pooling_benchmark.cc
+++ b/mace/ops/global_avg_pooling_benchmark.cc
@@ -40,7 +40,7 @@ static void GlobalAvgPooling(
   static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE(        \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(float)));                     \
     GlobalAvgPooling(iters, N, C, H, W);                                     \
   }                                                                          \
diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc
index 1850086d..f6e1c6d1 100644
--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -20,10 +20,8 @@ static void MatMulBenchmark(
   net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1});
 
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D, T>(net, "A", "AImage",
-                        kernels::BufferType::IN_OUT_WIDTH);
-    BufferToImage<D, T>(net, "B", "BImage",
-                        kernels::BufferType::IN_OUT_HEIGHT);
+    BufferToImage<D, T>(net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
+    BufferToImage<D, T>(net, "B", "BImage", kernels::BufferType::IN_OUT_HEIGHT);
 
     OpDefBuilder("MatMul", "MatMulBM")
         .Input("AImage")
@@ -52,16 +50,19 @@ static void MatMulBenchmark(
   net.Sync();
 }
 
-#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                             \
-  static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::ItemsProcessed(tot);                                       \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
-    MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                         \
-  }                                                                           \
+#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                              \
+  static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
+    const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W;          \
+    const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W);     \
+    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
+    MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                          \
+  }                                                                            \
   BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
 
-#define BM_MATMUL(N, H, C, W) \
+#define BM_MATMUL(N, H, C, W)                 \
+  BM_MATMUL_MACRO(N, H, C, W, float, CPU);    \
+  BM_MATMUL_MACRO(N, H, C, W, float, OPENCL); \
   BM_MATMUL_MACRO(N, H, C, W, half, OPENCL);
 
 BM_MATMUL(16, 32, 128, 49);
diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc
index 1a4d1925..fd673d42 100644
--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -54,7 +54,7 @@ static void Pooling(int iters,
   BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE( \
      int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(float)));                     \
     Pooling(iters, N, C, H, W, KE, STRIDE, Padding::PA,                      \
             PoolingType::PO);                                                \
diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc
index f582ede7..9daee298 100644
--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -61,8 +61,9 @@ static void ResizeBilinearBenchmark(int iters,
   static void                                                                \
   BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_##DEVICE( \
       int iters) {                                                           \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H1 * W1;       \
-    mace::testing::ItemsProcessed(tot);                                      \
+    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3;  \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0;       \
+    mace::testing::MaccProcessed(macc);                                      \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1);      \
   }                                                                          \
diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc
index 5e8a283d..f2ccaf5b 100644
--- a/mace/ops/softmax_benchmark.cc
+++ b/mace/ops/softmax_benchmark.cc
@@ -49,7 +49,7 @@ static void SoftmaxBenchmark(
 #define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                              \
   static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;            \
-    mace::testing::ItemsProcessed(tot);                                         \
+    mace::testing::MaccProcessed(tot);                                          \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                         \
     SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                             \
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index 86ba5808..ac643f94 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -42,7 +42,7 @@ static void BMSpaceToBatch(
   BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(     \
      int iters) {                                                            \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE);                  \
   }                                                                          \
diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc
index a7b99257..a8c0e77b 100644
--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -41,7 +41,7 @@ static void BMWinogradTransform(
   BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(           \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C);                    \
   }                                                                          \
@@ -93,7 +93,7 @@ static void BMWinogradInverseTransform(
   BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(   \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C);             \
   }                                                                          \
-- 
GitLab
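
Reviewer note (not part of the patch): the MACC figure that BM_CONV_2D_MACRO above feeds into MaccProcessed can be written as a plain helper for checking the numbers by hand. The sketch below is illustrative only; it assumes the same padding resolution the macro uses (SAME -> kernel/2, otherwise 0) and, like the macro, counts the bias add as one extra multiply-accumulate per output element. The function name and signature are hypothetical.

#include <cstdint>

// Mirrors the arithmetic in BM_CONV_2D_MACRO: output size from stride,
// dilation and padding, then KH * KW * C MACCs per output element plus bias.
int64_t Conv2dMacc(int64_t n, int64_t c, int64_t h, int64_t w,
                   int64_t kh, int64_t kw, int64_t stride, bool same_padding,
                   int64_t oc, int64_t dilation = 1) {
  const int64_t pad_h = same_padding ? kh / 2 : 0;
  const int64_t pad_w = same_padding ? kw / 2 : 0;
  const int64_t oh =
      (h + 2 * pad_h - kh - (kh - 1) * (dilation - 1)) / stride + 1;
  const int64_t ow =
      (w + 2 * pad_w - kw - (kw - 1) * (dilation - 1)) / stride + 1;
  return n * oc * oh * ow * (kh * kw * c + 1);
}

Dividing this count by the measured seconds per iteration gives the MACC(G/s) column that the new test_benchmark.cc printf reports (after the 1e-9 scaling).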