提交 d1d7302c 编写于 作者: L Liangliang He

Add MACC metrics in benchmark

上级 858b5c7f
......@@ -9,9 +9,9 @@
#include <regex>
#include <vector>
#include "mace/core/testing/test_benchmark.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
#include "mace/core/testing/test_benchmark.h"
namespace mace {
namespace testing {
......@@ -19,7 +19,7 @@ namespace testing {
// Pointer to the list of benchmarks that Benchmark::Run(pattern) iterates
// over; starts out null.
static std::vector<Benchmark *> *all_benchmarks = nullptr;
// Free-form label for the current run; cleared before every timed run.
static std::string label;
// Per-run work counters. Each one is reset to -1 before a timed run and is
// overwritten by the matching setter (BytesProcessed / ItemsProcessed /
// MaccProcessed) when the benchmark body calls it.
static int64_t bytes_processed;
static int64_t items_processed;
static int64_t macc_processed;
// Timer state. accum_time is zeroed before every timed run; presumably it
// accumulates elapsed time across Start/StopTiming pairs (StopTiming is not
// visible here -- TODO confirm).
static int64_t accum_time = 0;
// Timestamp taken from utils::NowMicros(); StartTiming() only overwrites it
// while it is 0.
static int64_t start_time = 0;
......@@ -81,8 +81,9 @@ void Benchmark::Run(const char *pattern) {
}
}
printf("%-*s %10s %10s\n", width, "Benchmark", "Time(ns)", "Iterations");
printf("%s\n", std::string(width + 22, '-').c_str());
printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)",
"Iterations", "Input(MB/s)", "MACC(G/s)");
printf("%s\n", std::string(width + 44, '-').c_str());
for (auto b : *all_benchmarks) {
if (!std::regex_match(b->name_, match, regex)) continue;
for (auto arg : b->args_) {
......@@ -98,20 +99,11 @@ void Benchmark::Run(const char *pattern) {
double seconds;
b->Run(arg.first, arg.second, &iters, &seconds);
char buf[100];
std::string full_label = label;
if (bytes_processed > 0) {
snprintf(buf, sizeof(buf), " %.1fMB/s",
(bytes_processed * 1e-6) / seconds);
full_label += buf;
}
if (items_processed > 0) {
snprintf(buf, sizeof(buf), " %.1fM items/s",
(items_processed * 1e-6) / seconds);
full_label += buf;
}
printf("%-*s %10.0f %10d\t%s\n", width, name, seconds * 1e9 / iters,
iters, full_label.c_str());
float mbps = (bytes_processed * 1e-6) / seconds;
// MACCs or other computations
float gmaccs = (macc_processed * 1e-9) / seconds;
printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, name,
seconds * 1e9 / iters, iters, mbps, gmaccs);
}
}
}
......@@ -130,7 +122,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
accum_time = 0;
start_time = utils::NowMicros();
bytes_processed = -1;
items_processed = -1;
macc_processed = -1;
label.clear();
if (fn0_) {
(*fn0_)(iters);
......@@ -158,7 +150,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
}
// Records how many bytes the current benchmark run handled. The stored value
// is what Benchmark::Run divides by the elapsed seconds to print MB/s.
void BytesProcessed(int64_t n) {
  bytes_processed = n;
}
// Records how many items the current benchmark run handled by storing the
// count in the file-level items_processed counter.
void ItemsProcessed(int64_t n) {
  items_processed = n;
}
// Records the MACC count (or other unit of computation) performed by the
// current benchmark run. The stored value is what Benchmark::Run divides by
// the elapsed seconds to print the MACC(G/s) column.
void MaccProcessed(int64_t n) {
  macc_processed = n;
}
// Stamps start_time with the current utils::NowMicros() reading, but only
// when start_time is 0; a non-zero start_time is left untouched.
void StartTiming() {
  if (start_time != 0) {
    return;
  }
  start_time = utils::NowMicros();
}
......
......@@ -43,7 +43,7 @@ class Benchmark {
void RunBenchmarks();
void BytesProcessed(int64_t);
void ItemsProcessed(int64_t);
void MaccProcessed(int64_t);
void StartTiming();
void StopTiming();
......
......@@ -7,7 +7,7 @@
static void foo(int iters) {
static const int N = 32;
const int64_t tot = static_cast<int64_t>(iters) * N;
mace::testing::ItemsProcessed(tot);
mace::testing::MaccProcessed(tot);
mace::testing::BytesProcessed(tot * (sizeof(float)));
float *inp = new float[N];
......@@ -26,7 +26,7 @@ BENCHMARK(foo);
static void bar(int iters, int n) {
const int64_t tot = static_cast<int64_t>(iters) * n;
mace::testing::ItemsProcessed(tot);
mace::testing::MaccProcessed(tot);
mace::testing::BytesProcessed(tot * (sizeof(float)));
float *inp = new float[n];
......
......@@ -51,21 +51,22 @@ static void ReluBenchmark(
#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_RELU(N, C, H, W, TYPE) \
BM_RELU_MACRO(N, C, H, W, TYPE, CPU); \
BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL);
#define BM_RELU(N, C, H, W) \
BM_RELU_MACRO(N, C, H, W, float, CPU); \
BM_RELU_MACRO(N, C, H, W, float, OPENCL); \
BM_RELU_MACRO(N, C, H, W, half, OPENCL);
BM_RELU(1, 1, 512, 512, float);
BM_RELU(1, 3, 128, 128, float);
BM_RELU(1, 3, 512, 512, float);
BM_RELU(1, 32, 112, 112, float);
BM_RELU(1, 64, 256, 256, float);
BM_RELU(1, 1, 512, 512);
BM_RELU(1, 3, 128, 128);
BM_RELU(1, 3, 512, 512);
BM_RELU(1, 32, 112, 112);
BM_RELU(1, 64, 256, 256);
template <DeviceType D, typename T>
static void ReluxBenchmark(
......@@ -112,21 +113,22 @@ static void ReluxBenchmark(
#define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_RELUX(N, C, H, W, TYPE) \
BM_RELUX_MACRO(N, C, H, W, TYPE, CPU); \
BM_RELUX_MACRO(N, C, H, W, TYPE, OPENCL);
#define BM_RELUX(N, C, H, W) \
BM_RELUX_MACRO(N, C, H, W, float, CPU); \
BM_RELUX_MACRO(N, C, H, W, float, OPENCL); \
BM_RELUX_MACRO(N, C, H, W, half, OPENCL);
BM_RELUX(1, 1, 512, 512, float);
BM_RELUX(1, 3, 128, 128, float);
BM_RELUX(1, 3, 512, 512, float);
BM_RELUX(1, 32, 112, 112, float);
BM_RELUX(1, 64, 256, 256, float);
BM_RELUX(1, 1, 512, 512);
BM_RELUX(1, 3, 128, 128);
BM_RELUX(1, 3, 512, 512);
BM_RELUX(1, 32, 112, 112);
BM_RELUX(1, 64, 256, 256);
template <DeviceType D, typename T>
static void PreluBenchmark(
......@@ -173,21 +175,22 @@ static void PreluBenchmark(
#define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_PRELU(N, C, H, W, TYPE) \
BM_PRELU_MACRO(N, C, H, W, TYPE, CPU); \
BM_PRELU_MACRO(N, C, H, W, TYPE, OPENCL);
#define BM_PRELU(N, C, H, W) \
BM_PRELU_MACRO(N, C, H, W, float, CPU); \
BM_PRELU_MACRO(N, C, H, W, float, OPENCL); \
BM_PRELU_MACRO(N, C, H, W, half, OPENCL);
BM_PRELU(1, 1, 512, 512, float);
BM_PRELU(1, 3, 128, 128, float);
BM_PRELU(1, 3, 512, 512, float);
BM_PRELU(1, 32, 112, 112, float);
BM_PRELU(1, 64, 256, 256, float);
BM_PRELU(1, 1, 512, 512);
BM_PRELU(1, 3, 128, 128);
BM_PRELU(1, 3, 512, 512);
BM_PRELU(1, 32, 112, 112);
BM_PRELU(1, 64, 256, 256);
template <DeviceType D, typename T>
static void TanhBenchmark(
......@@ -232,21 +235,22 @@ static void TanhBenchmark(
#define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_TANH(N, C, H, W, TYPE) \
BM_TANH_MACRO(N, C, H, W, TYPE, CPU); \
BM_TANH_MACRO(N, C, H, W, TYPE, OPENCL);
#define BM_TANH(N, C, H, W) \
BM_TANH_MACRO(N, C, H, W, float, CPU); \
BM_TANH_MACRO(N, C, H, W, float, OPENCL); \
BM_TANH_MACRO(N, C, H, W, half, OPENCL);
BM_TANH(1, 1, 512, 512, float);
BM_TANH(1, 3, 128, 128, float);
BM_TANH(1, 3, 512, 512, float);
BM_TANH(1, 32, 112, 112, float);
BM_TANH(1, 64, 256, 256, float);
BM_TANH(1, 1, 512, 512);
BM_TANH(1, 3, 128, 128);
BM_TANH(1, 3, 512, 512);
BM_TANH(1, 32, 112, 112);
BM_TANH(1, 64, 256, 256);
template <DeviceType D, typename T>
static void SigmoidBenchmark(
......@@ -292,7 +296,7 @@ static void SigmoidBenchmark(
static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -55,18 +55,18 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
}
}
#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
} \
#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
} \
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ADDN(INPUTS, N, H, W, C) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
#define BM_ADDN(INPUTS, N, H, W, C) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL);
......
......@@ -76,7 +76,7 @@ static void BatchNorm(
static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......@@ -84,7 +84,6 @@ static void BatchNorm(
#define BM_BATCH_NORM(N, C, H, W) \
BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, float, NEON); \
BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \
BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL);
......
......@@ -41,7 +41,7 @@ static void BMBatchToSpace(
BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
} \
......
......@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -41,7 +41,7 @@ static void ChannelShuffle(
static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \
ChannelShuffle<DEVICE>(iters, N, C, H, W, G); \
} \
......
......@@ -29,7 +29,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
net.RunOp(D);
}
const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * 2;
mace::testing::ItemsProcessed(tot);
mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
......@@ -80,7 +80,7 @@ static void OpenclConcatHelper(int iters,
const int64_t tot =
static_cast<int64_t>(iters) *
(net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
mace::testing::ItemsProcessed(tot);
mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming();
while (iters--) {
......
......@@ -83,8 +83,20 @@ static void Conv2d(int iters,
static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t dilation = 1; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
OC); \
......
......@@ -75,24 +75,36 @@ static void DepthwiseConv2d(int iters,
}
}
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
DEVICE) \
static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, OC); \
} \
BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \
DEVICE) \
static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t dilation = 1; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
} \
BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1);
......
......@@ -61,7 +61,7 @@ static void EltwiseBenchmark(
BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
......
......@@ -40,7 +40,7 @@ static void GlobalAvgPooling(
static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \
GlobalAvgPooling<DEVICE>(iters, N, C, H, W); \
} \
......
......@@ -20,10 +20,8 @@ static void MatMulBenchmark(
net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "A", "AImage",
kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<D, T>(net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT);
BufferToImage<D, T>(net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<D, T>(net, "B", "BImage", kernels::BufferType::IN_OUT_HEIGHT);
OpDefBuilder("MatMul", "MatMulBM")
.Input("AImage")
......@@ -52,16 +50,19 @@ static void MatMulBenchmark(
net.Sync();
}
#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
#define BM_MATMUL(N, H, C, W) \
#define BM_MATMUL(N, H, C, W) \
BM_MATMUL_MACRO(N, H, C, W, float, CPU); \
BM_MATMUL_MACRO(N, H, C, W, float, OPENCL); \
BM_MATMUL_MACRO(N, H, C, W, half, OPENCL);
BM_MATMUL(16, 32, 128, 49);
......
......@@ -54,7 +54,7 @@ static void Pooling(int iters,
BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \
Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
PoolingType::PO); \
......
......@@ -61,8 +61,9 @@ static void ResizeBilinearBenchmark(int iters,
static void \
BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H1 * W1; \
mace::testing::ItemsProcessed(tot); \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \
......
......@@ -49,7 +49,7 @@ static void SoftmaxBenchmark(
#define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \
......
......@@ -42,7 +42,7 @@ static void BMSpaceToBatch(
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \
} \
......
......@@ -41,7 +41,7 @@ static void BMWinogradTransform(
BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \
......@@ -93,7 +93,7 @@ static void BMWinogradInverseTransform(
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册