Add MACC metrics in benchmark

d1d7302c · Liangliang He · 858b5c7f · d1d7302c · d1d7302c · d1d7302c
20 changed files
--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -9,9 +9,9 @@
 #include <regex>
 #include <vector>

+#include "mace/core/testing/test_benchmark.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"
-#include "mace/core/testing/test_benchmark.h"

 namespace mace {
 namespace testing {
@@ -19,7 +19,7 @@ namespace testing {
 static std::vector<Benchmark *> *all_benchmarks = nullptr;
 static std::string label;
 static int64_t bytes_processed;
-static int64_t items_processed;
+static int64_t macc_processed;
 static int64_t accum_time = 0;
 static int64_t start_time = 0;

@@ -81,8 +81,9 @@ void Benchmark::Run(const char *pattern) {
    }
  }

-  printf("%-*s %10s %10s\n", width, "Benchmark", "Time(ns)", "Iterations");
-  printf("%s\n", std::string(width + 22, '-').c_str());
+  printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)",
+         "Iterations", "Input(MB/s)", "MACC(G/s)");
+  printf("%s\n", std::string(width + 44, '-').c_str());
  for (auto b : *all_benchmarks) {
    if (!std::regex_match(b->name_, match, regex)) continue;
    for (auto arg : b->args_) {
@@ -98,20 +99,11 @@ void Benchmark::Run(const char *pattern) {
      double seconds;
      b->Run(arg.first, arg.second, &iters, &seconds);

-      char buf[100];
-      std::string full_label = label;
-      if (bytes_processed > 0) {
-        snprintf(buf, sizeof(buf), " %.1fMB/s",
-                 (bytes_processed * 1e-6) / seconds);
-        full_label += buf;
-      }
-      if (items_processed > 0) {
-        snprintf(buf, sizeof(buf), " %.1fM items/s",
-                 (items_processed * 1e-6) / seconds);
-        full_label += buf;
-      }
-      printf("%-*s %10.0f %10d\t%s\n", width, name, seconds * 1e9 / iters,
-             iters, full_label.c_str());
+      float mbps = (bytes_processed * 1e-6) / seconds;
+      // MACCs or other computations
+      float gmaccs = (macc_processed * 1e-9) / seconds;
+      printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, name,
+             seconds * 1e9 / iters, iters, mbps, gmaccs);
    }
  }
 }
@@ -130,7 +122,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
    accum_time = 0;
    start_time = utils::NowMicros();
    bytes_processed = -1;
-    items_processed = -1;
+    macc_processed = -1;
    label.clear();
    if (fn0_) {
      (*fn0_)(iters);
@@ -158,7 +150,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
 }

 void BytesProcessed(int64_t n) { bytes_processed = n; }
-void ItemsProcessed(int64_t n) { items_processed = n; }
+void MaccProcessed(int64_t n) { macc_processed = n; }
 void StartTiming() {
  if (start_time == 0) start_time = utils::NowMicros();
 }

--- a/mace/core/testing/test_benchmark.h
+++ b/mace/core/testing/test_benchmark.h
@@ -43,7 +43,7 @@ class Benchmark {

 void RunBenchmarks();
 void BytesProcessed(int64_t);
-void ItemsProcessed(int64_t);
+void MaccProcessed(int64_t);
 void StartTiming();
 void StopTiming();


--- a/mace/examples/benchmark_example.cc
+++ b/mace/examples/benchmark_example.cc
@@ -7,7 +7,7 @@
 static void foo(int iters) {
  static const int N = 32;
  const int64_t tot = static_cast<int64_t>(iters) * N;
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
  mace::testing::BytesProcessed(tot * (sizeof(float)));

  float *inp = new float[N];
@@ -26,7 +26,7 @@ BENCHMARK(foo);

 static void bar(int iters, int n) {
  const int64_t tot = static_cast<int64_t>(iters) * n;
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
  mace::testing::BytesProcessed(tot * (sizeof(float)));

  float *inp = new float[n];

--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -51,21 +51,22 @@ static void ReluBenchmark(
 #define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
  static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
    ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
  }                                                                          \
  BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_RELU(N, C, H, W, TYPE)       \
-  BM_RELU_MACRO(N, C, H, W, TYPE, CPU); \
-  BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_RELU(N, C, H, W)                 \
+  BM_RELU_MACRO(N, C, H, W, float, CPU);    \
+  BM_RELU_MACRO(N, C, H, W, float, OPENCL); \
+  BM_RELU_MACRO(N, C, H, W, half, OPENCL);

-BM_RELU(1, 1, 512, 512, float);
-BM_RELU(1, 3, 128, 128, float);
-BM_RELU(1, 3, 512, 512, float);
-BM_RELU(1, 32, 112, 112, float);
-BM_RELU(1, 64, 256, 256, float);
+BM_RELU(1, 1, 512, 512);
+BM_RELU(1, 3, 128, 128);
+BM_RELU(1, 3, 512, 512);
+BM_RELU(1, 32, 112, 112);
+BM_RELU(1, 64, 256, 256);

 template <DeviceType D, typename T>
 static void ReluxBenchmark(
@@ -112,21 +113,22 @@ static void ReluxBenchmark(
 #define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE)                              \
  static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::ItemsProcessed(tot);                                       \
+    mace::testing::MaccProcessed(tot);                                        \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
    ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
  }                                                                           \
  BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_RELUX(N, C, H, W, TYPE)       \
-  BM_RELUX_MACRO(N, C, H, W, TYPE, CPU); \
-  BM_RELUX_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_RELUX(N, C, H, W)                 \
+  BM_RELUX_MACRO(N, C, H, W, float, CPU);    \
+  BM_RELUX_MACRO(N, C, H, W, float, OPENCL); \
+  BM_RELUX_MACRO(N, C, H, W, half, OPENCL);

-BM_RELUX(1, 1, 512, 512, float);
-BM_RELUX(1, 3, 128, 128, float);
-BM_RELUX(1, 3, 512, 512, float);
-BM_RELUX(1, 32, 112, 112, float);
-BM_RELUX(1, 64, 256, 256, float);
+BM_RELUX(1, 1, 512, 512);
+BM_RELUX(1, 3, 128, 128);
+BM_RELUX(1, 3, 512, 512);
+BM_RELUX(1, 32, 112, 112);
+BM_RELUX(1, 64, 256, 256);

 template <DeviceType D, typename T>
 static void PreluBenchmark(
@@ -173,21 +175,22 @@ static void PreluBenchmark(
 #define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
  static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::ItemsProcessed(tot);                                       \
+    mace::testing::MaccProcessed(tot);                                        \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
    PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
  }                                                                           \
  BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_PRELU(N, C, H, W, TYPE)       \
-  BM_PRELU_MACRO(N, C, H, W, TYPE, CPU); \
-  BM_PRELU_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_PRELU(N, C, H, W)                 \
+  BM_PRELU_MACRO(N, C, H, W, float, CPU);    \
+  BM_PRELU_MACRO(N, C, H, W, float, OPENCL); \
+  BM_PRELU_MACRO(N, C, H, W, half, OPENCL);

-BM_PRELU(1, 1, 512, 512, float);
-BM_PRELU(1, 3, 128, 128, float);
-BM_PRELU(1, 3, 512, 512, float);
-BM_PRELU(1, 32, 112, 112, float);
-BM_PRELU(1, 64, 256, 256, float);
+BM_PRELU(1, 1, 512, 512);
+BM_PRELU(1, 3, 128, 128);
+BM_PRELU(1, 3, 512, 512);
+BM_PRELU(1, 32, 112, 112);
+BM_PRELU(1, 64, 256, 256);

 template <DeviceType D, typename T>
 static void TanhBenchmark(
@@ -232,21 +235,22 @@ static void TanhBenchmark(
 #define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE)                              \
  static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
    TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
  }                                                                          \
  BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_TANH(N, C, H, W, TYPE)       \
-  BM_TANH_MACRO(N, C, H, W, TYPE, CPU); \
-  BM_TANH_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_TANH(N, C, H, W)                 \
+  BM_TANH_MACRO(N, C, H, W, float, CPU);    \
+  BM_TANH_MACRO(N, C, H, W, float, OPENCL); \
+  BM_TANH_MACRO(N, C, H, W, half, OPENCL);

-BM_TANH(1, 1, 512, 512, float);
-BM_TANH(1, 3, 128, 128, float);
-BM_TANH(1, 3, 512, 512, float);
-BM_TANH(1, 32, 112, 112, float);
-BM_TANH(1, 64, 256, 256, float);
+BM_TANH(1, 1, 512, 512);
+BM_TANH(1, 3, 128, 128);
+BM_TANH(1, 3, 512, 512);
+BM_TANH(1, 32, 112, 112);
+BM_TANH(1, 64, 256, 256);

 template <DeviceType D, typename T>
 static void SigmoidBenchmark(
@@ -292,7 +296,7 @@ static void SigmoidBenchmark(
  static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
      int iters) {                                                   \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::ItemsProcessed(tot);                              \
+    mace::testing::MaccProcessed(tot);                               \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
    SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W);               \
  }                                                                  \

--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -55,18 +55,18 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
  }
 }

-#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                     \
-  static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
-      int iters) {                                                          \
-    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;        \
-    mace::testing::ItemsProcessed(tot);                                     \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
-    AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C);                 \
-  }                                                                         \
+#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                       \
+  static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(   \
+      int iters) {                                                            \
+    const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
+    mace::testing::MaccProcessed(tot);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C);                   \
+  }                                                                           \
  BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)

-#define BM_ADDN(INPUTS, N, H, W, C)       \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
+#define BM_ADDN(INPUTS, N, H, W, C)                 \
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU);    \
  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \
  BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL);


--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -76,7 +76,7 @@ static void BatchNorm(
  static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
      int iters) {                                                     \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;   \
-    mace::testing::ItemsProcessed(tot);                                \
+    mace::testing::MaccProcessed(tot);                                \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                \
    BatchNorm<DEVICE, TYPE>(iters, N, C, H, W);                        \
  }                                                                    \
@@ -84,7 +84,6 @@ static void BatchNorm(

 #define BM_BATCH_NORM(N, C, H, W)                 \
  BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU);    \
-  BM_BATCH_NORM_MACRO(N, C, H, W, float, NEON);    \
  BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \
  BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL);


--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -41,7 +41,7 @@ static void BMBatchToSpace(
      BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \
          int iters) {                                                     \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;       \
-    mace::testing::ItemsProcessed(tot);                                    \
+    mace::testing::MaccProcessed(tot);                                    \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                    \
    BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG);                  \
  }                                                                        \

--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
  static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
      int iters) {                                                   \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::ItemsProcessed(tot);                              \
+    mace::testing::MaccProcessed(tot);                              \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
    BiasAdd<DEVICE, TYPE>(iters, N, C, H, W);                        \
  }                                                                  \

--- a/mace/ops/channel_shuffle_benchmark.cc
+++ b/mace/ops/channel_shuffle_benchmark.cc
@@ -41,7 +41,7 @@ static void ChannelShuffle(
  static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE( \
      int iters) {                                                       \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;     \
-    mace::testing::ItemsProcessed(tot);                                  \
+    mace::testing::MaccProcessed(tot);                                  \
    mace::testing::BytesProcessed(tot *(sizeof(float)));                 \
    ChannelShuffle<DEVICE>(iters, N, C, H, W, G);                        \
  }                                                                      \

--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -29,7 +29,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
    net.RunOp(D);
  }
  const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * 2;
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
  testing::BytesProcessed(tot * sizeof(T));
  mace::testing::StartTiming();
  while (iters--) {
@@ -80,7 +80,7 @@ static void OpenclConcatHelper(int iters,
  const int64_t tot =
      static_cast<int64_t>(iters) *
      (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
-  mace::testing::ItemsProcessed(tot);
+  mace::testing::MaccProcessed(tot);
  testing::BytesProcessed(tot * sizeof(T));
  mace::testing::StartTiming();
  while (iters--) {

--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -83,8 +83,20 @@ static void Conv2d(int iters,
  static void                                                                                      \
      BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
          int iters) {                                                                             \
+    const int64_t dilation = 1;                                                                    \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;                               \
-    mace::testing::ItemsProcessed(tot);                                                            \
+    int64_t pad_h = 0, pad_w = 0;                                                                  \
+    if (P == SAME) {                                                                               \
+      pad_h = KH / 2;                                                                              \
+      pad_w = KW / 2;                                                                              \
+    }                                                                                              \
+    int64_t oh =                                                                                   \
+        (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1;                             \
+    int64_t ow =                                                                                   \
+        (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1;                             \
+    const int64_t macc =                                                                           \
+        static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1);                        \
+    mace::testing::MaccProcessed(macc);                                                            \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                                            \
    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P,                      \
                         OC);                                                                      \

--- a/mace/ops/depthwise_conv2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -75,24 +75,36 @@ static void DepthwiseConv2d(int iters,
  }
 }

-#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE,                                  \
-                                   DEVICE)                                                                   \
-  static void                                                                                                \
-      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
-          int iters) {                                                                                       \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;                                         \
-    mace::testing::ItemsProcessed(tot);                                                                      \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                                                      \
-    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,                                         \
-                                  mace::Padding::P, OC);                                                     \
-  }                                                                                                          \
-  BENCHMARK(                                                                                                 \
-      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
+#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE,                                  \
+                                   DEVICE)                                                                  \
+  static void                                                                                               \
+      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE( \
+          int iters) {                                                                                      \
+    const int64_t dilation = 1;                                                                             \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;                                        \
+    int64_t pad_h = 0, pad_w = 0;                                                                           \
+    if (P == SAME) {                                                                                        \
+      pad_h = KH / 2;                                                                                       \
+      pad_w = KW / 2;                                                                                       \
+    }                                                                                                       \
+    int64_t oh =                                                                                            \
+        (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1;                                      \
+    int64_t ow =                                                                                            \
+        (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1;                                      \
+    const int64_t macc =                                                                                    \
+        static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1);                                  \
+    mace::testing::MaccProcessed(macc);                                                                     \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                                                     \
+    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,                                        \
+                                  mace::Padding::P, M);                                                     \
+  }                                                                                                         \
+  BENCHMARK(                                                                                                \
+      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE)

-#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC)                 \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU);    \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
+#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M)                 \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU);    \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL);

 BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
 BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1);

--- a/mace/ops/eltwise_benchmark.cc
+++ b/mace/ops/eltwise_benchmark.cc
@@ -61,7 +61,7 @@ static void EltwiseBenchmark(
      BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
          int iters) {                                                   \
    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;     \
-    mace::testing::ItemsProcessed(tot);                                  \
+    mace::testing::MaccProcessed(tot);                                  \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                  \
    EltwiseBenchmark<DEVICE, TYPE>(                                      \
        iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \

--- a/mace/ops/global_avg_pooling_benchmark.cc
+++ b/mace/ops/global_avg_pooling_benchmark.cc
@@ -40,7 +40,7 @@ static void GlobalAvgPooling(
  static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE( \
      int iters) {                                                    \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;  \
-    mace::testing::ItemsProcessed(tot);                               \
+    mace::testing::MaccProcessed(tot);                               \
    mace::testing::BytesProcessed(tot *(sizeof(float)));              \
    GlobalAvgPooling<DEVICE>(iters, N, C, H, W);                      \
  }                                                                   \

--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -20,10 +20,8 @@ static void MatMulBenchmark(
  net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1});

  if (D == DeviceType::OPENCL) {
-    BufferToImage<D, T>(net, "A", "AImage",
-                            kernels::BufferType::IN_OUT_WIDTH);
-    BufferToImage<D, T>(net, "B", "BImage",
-                            kernels::BufferType::IN_OUT_HEIGHT);
+    BufferToImage<D, T>(net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
+    BufferToImage<D, T>(net, "B", "BImage", kernels::BufferType::IN_OUT_HEIGHT);

    OpDefBuilder("MatMul", "MatMulBM")
        .Input("AImage")
@@ -52,16 +50,19 @@ static void MatMulBenchmark(
  net.Sync();
 }

-#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                      \
-  static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) {  \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::ItemsProcessed(tot);                              \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
-    MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                  \
-  }                                                                  \
+#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE)                              \
+  static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
+    const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W;          \
+    const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W);     \
+    mace::testing::MaccProcessed(macc);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
+    MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W);                          \
+  }                                                                            \
  BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)

-#define BM_MATMUL(N, H, C, W) \
+#define BM_MATMUL(N, H, C, W)                 \
+  BM_MATMUL_MACRO(N, H, C, W, float, CPU);    \
+  BM_MATMUL_MACRO(N, H, C, W, float, OPENCL); \
  BM_MATMUL_MACRO(N, H, C, W, half, OPENCL);

 BM_MATMUL(16, 32, 128, 49);

--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -54,7 +54,7 @@ static void Pooling(int iters,
      BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE( \
          int iters) {                                                              \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;                \
-    mace::testing::ItemsProcessed(tot);                                             \
+    mace::testing::MaccProcessed(tot);                                             \
    mace::testing::BytesProcessed(tot *(sizeof(float)));                            \
    Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA,                     \
                    PoolingType::PO);                                               \

--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -61,8 +61,9 @@ static void ResizeBilinearBenchmark(int iters,
  static void                                                                       \
      BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_##DEVICE( \
          int iters) {                                                              \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H1 * W1;              \
-    mace::testing::ItemsProcessed(tot);                                             \
+    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3;         \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0;              \
+    mace::testing::MaccProcessed(macc);                                             \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                             \
    ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1);             \
  }                                                                                 \

--- a/mace/ops/softmax_benchmark.cc
+++ b/mace/ops/softmax_benchmark.cc
@@ -49,7 +49,7 @@ static void SoftmaxBenchmark(
 #define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                      \
  static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) {  \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::ItemsProcessed(tot);                              \
+    mace::testing::MaccProcessed(tot);                              \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
    SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                  \
  }                                                                  \

--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -42,7 +42,7 @@ static void BMSpaceToBatch(
      BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \
          int iters) {                                                       \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::ItemsProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                      \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
    BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE);                  \
  }                                                                          \

--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -41,7 +41,7 @@ static void BMWinogradTransform(
      BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(    \
          int iters) {                                                \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;  \
-    mace::testing::ItemsProcessed(tot);                               \
+    mace::testing::MaccProcessed(tot);                               \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));               \
    BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C);                  \
  }                                                                   \
@@ -93,7 +93,7 @@ static void BMWinogradInverseTransform(
      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(    \
          int iters) {                                                \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;  \
-    mace::testing::ItemsProcessed(tot);                               \
+    mace::testing::MaccProcessed(tot);                               \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));               \
    BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C);                  \
  }                                                                   \