提交 d1d7302c 编写于 作者: L Liangliang He

Add MACC metrics in benchmark

上级 858b5c7f
...@@ -9,9 +9,9 @@ ...@@ -9,9 +9,9 @@
#include <regex> #include <regex>
#include <vector> #include <vector>
#include "mace/core/testing/test_benchmark.h"
#include "mace/utils/env_time.h" #include "mace/utils/env_time.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/core/testing/test_benchmark.h"
namespace mace { namespace mace {
namespace testing { namespace testing {
...@@ -19,7 +19,7 @@ namespace testing { ...@@ -19,7 +19,7 @@ namespace testing {
static std::vector<Benchmark *> *all_benchmarks = nullptr; static std::vector<Benchmark *> *all_benchmarks = nullptr;
static std::string label; static std::string label;
static int64_t bytes_processed; static int64_t bytes_processed;
static int64_t items_processed; static int64_t macc_processed;
static int64_t accum_time = 0; static int64_t accum_time = 0;
static int64_t start_time = 0; static int64_t start_time = 0;
...@@ -81,8 +81,9 @@ void Benchmark::Run(const char *pattern) { ...@@ -81,8 +81,9 @@ void Benchmark::Run(const char *pattern) {
} }
} }
printf("%-*s %10s %10s\n", width, "Benchmark", "Time(ns)", "Iterations"); printf("%-*s %10s %10s %10s %10s\n", width, "Benchmark", "Time(ns)",
printf("%s\n", std::string(width + 22, '-').c_str()); "Iterations", "Input(MB/s)", "MACC(G/s)");
printf("%s\n", std::string(width + 44, '-').c_str());
for (auto b : *all_benchmarks) { for (auto b : *all_benchmarks) {
if (!std::regex_match(b->name_, match, regex)) continue; if (!std::regex_match(b->name_, match, regex)) continue;
for (auto arg : b->args_) { for (auto arg : b->args_) {
...@@ -98,20 +99,11 @@ void Benchmark::Run(const char *pattern) { ...@@ -98,20 +99,11 @@ void Benchmark::Run(const char *pattern) {
double seconds; double seconds;
b->Run(arg.first, arg.second, &iters, &seconds); b->Run(arg.first, arg.second, &iters, &seconds);
char buf[100]; float mbps = (bytes_processed * 1e-6) / seconds;
std::string full_label = label; // MACCs or other computations
if (bytes_processed > 0) { float gmaccs = (macc_processed * 1e-9) / seconds;
snprintf(buf, sizeof(buf), " %.1fMB/s", printf("%-*s %10.0f %10d %10.2f %10.2f\n", width, name,
(bytes_processed * 1e-6) / seconds); seconds * 1e9 / iters, iters, mbps, gmaccs);
full_label += buf;
}
if (items_processed > 0) {
snprintf(buf, sizeof(buf), " %.1fM items/s",
(items_processed * 1e-6) / seconds);
full_label += buf;
}
printf("%-*s %10.0f %10d\t%s\n", width, name, seconds * 1e9 / iters,
iters, full_label.c_str());
} }
} }
} }
...@@ -130,7 +122,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) { ...@@ -130,7 +122,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
accum_time = 0; accum_time = 0;
start_time = utils::NowMicros(); start_time = utils::NowMicros();
bytes_processed = -1; bytes_processed = -1;
items_processed = -1; macc_processed = -1;
label.clear(); label.clear();
if (fn0_) { if (fn0_) {
(*fn0_)(iters); (*fn0_)(iters);
...@@ -158,7 +150,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) { ...@@ -158,7 +150,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
} }
void BytesProcessed(int64_t n) { bytes_processed = n; } void BytesProcessed(int64_t n) { bytes_processed = n; }
void ItemsProcessed(int64_t n) { items_processed = n; } void MaccProcessed(int64_t n) { macc_processed = n; }
void StartTiming() { void StartTiming() {
if (start_time == 0) start_time = utils::NowMicros(); if (start_time == 0) start_time = utils::NowMicros();
} }
......
...@@ -43,7 +43,7 @@ class Benchmark { ...@@ -43,7 +43,7 @@ class Benchmark {
void RunBenchmarks(); void RunBenchmarks();
void BytesProcessed(int64_t); void BytesProcessed(int64_t);
void ItemsProcessed(int64_t); void MaccProcessed(int64_t);
void StartTiming(); void StartTiming();
void StopTiming(); void StopTiming();
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
static void foo(int iters) { static void foo(int iters) {
static const int N = 32; static const int N = 32;
const int64_t tot = static_cast<int64_t>(iters) * N; const int64_t tot = static_cast<int64_t>(iters) * N;
mace::testing::ItemsProcessed(tot); mace::testing::MaccProcessed(tot);
mace::testing::BytesProcessed(tot * (sizeof(float))); mace::testing::BytesProcessed(tot * (sizeof(float)));
float *inp = new float[N]; float *inp = new float[N];
...@@ -26,7 +26,7 @@ BENCHMARK(foo); ...@@ -26,7 +26,7 @@ BENCHMARK(foo);
static void bar(int iters, int n) { static void bar(int iters, int n) {
const int64_t tot = static_cast<int64_t>(iters) * n; const int64_t tot = static_cast<int64_t>(iters) * n;
mace::testing::ItemsProcessed(tot); mace::testing::MaccProcessed(tot);
mace::testing::BytesProcessed(tot * (sizeof(float))); mace::testing::BytesProcessed(tot * (sizeof(float)));
float *inp = new float[n]; float *inp = new float[n];
......
...@@ -51,21 +51,22 @@ static void ReluBenchmark( ...@@ -51,21 +51,22 @@ static void ReluBenchmark(
#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \ #define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \ ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_RELU(N, C, H, W, TYPE) \ #define BM_RELU(N, C, H, W) \
BM_RELU_MACRO(N, C, H, W, TYPE, CPU); \ BM_RELU_MACRO(N, C, H, W, float, CPU); \
BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL); BM_RELU_MACRO(N, C, H, W, float, OPENCL); \
BM_RELU_MACRO(N, C, H, W, half, OPENCL);
BM_RELU(1, 1, 512, 512, float); BM_RELU(1, 1, 512, 512);
BM_RELU(1, 3, 128, 128, float); BM_RELU(1, 3, 128, 128);
BM_RELU(1, 3, 512, 512, float); BM_RELU(1, 3, 512, 512);
BM_RELU(1, 32, 112, 112, float); BM_RELU(1, 32, 112, 112);
BM_RELU(1, 64, 256, 256, float); BM_RELU(1, 64, 256, 256);
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void ReluxBenchmark( static void ReluxBenchmark(
...@@ -112,21 +113,22 @@ static void ReluxBenchmark( ...@@ -112,21 +113,22 @@ static void ReluxBenchmark(
#define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \ #define BM_RELUX_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ static void BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \ ReluxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_RELUX(N, C, H, W, TYPE) \ #define BM_RELUX(N, C, H, W) \
BM_RELUX_MACRO(N, C, H, W, TYPE, CPU); \ BM_RELUX_MACRO(N, C, H, W, float, CPU); \
BM_RELUX_MACRO(N, C, H, W, TYPE, OPENCL); BM_RELUX_MACRO(N, C, H, W, float, OPENCL); \
BM_RELUX_MACRO(N, C, H, W, half, OPENCL);
BM_RELUX(1, 1, 512, 512, float); BM_RELUX(1, 1, 512, 512);
BM_RELUX(1, 3, 128, 128, float); BM_RELUX(1, 3, 128, 128);
BM_RELUX(1, 3, 512, 512, float); BM_RELUX(1, 3, 512, 512);
BM_RELUX(1, 32, 112, 112, float); BM_RELUX(1, 32, 112, 112);
BM_RELUX(1, 64, 256, 256, float); BM_RELUX(1, 64, 256, 256);
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void PreluBenchmark( static void PreluBenchmark(
...@@ -173,21 +175,22 @@ static void PreluBenchmark( ...@@ -173,21 +175,22 @@ static void PreluBenchmark(
#define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \ #define BM_PRELU_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ static void BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \ PreluBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_PRELU(N, C, H, W, TYPE) \ #define BM_PRELU(N, C, H, W) \
BM_PRELU_MACRO(N, C, H, W, TYPE, CPU); \ BM_PRELU_MACRO(N, C, H, W, float, CPU); \
BM_PRELU_MACRO(N, C, H, W, TYPE, OPENCL); BM_PRELU_MACRO(N, C, H, W, float, OPENCL); \
BM_PRELU_MACRO(N, C, H, W, half, OPENCL);
BM_PRELU(1, 1, 512, 512, float); BM_PRELU(1, 1, 512, 512);
BM_PRELU(1, 3, 128, 128, float); BM_PRELU(1, 3, 128, 128);
BM_PRELU(1, 3, 512, 512, float); BM_PRELU(1, 3, 512, 512);
BM_PRELU(1, 32, 112, 112, float); BM_PRELU(1, 32, 112, 112);
BM_PRELU(1, 64, 256, 256, float); BM_PRELU(1, 64, 256, 256);
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void TanhBenchmark( static void TanhBenchmark(
...@@ -232,21 +235,22 @@ static void TanhBenchmark( ...@@ -232,21 +235,22 @@ static void TanhBenchmark(
#define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \ #define BM_TANH_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ static void BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \ TanhBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_TANH(N, C, H, W, TYPE) \ #define BM_TANH(N, C, H, W) \
BM_TANH_MACRO(N, C, H, W, TYPE, CPU); \ BM_TANH_MACRO(N, C, H, W, float, CPU); \
BM_TANH_MACRO(N, C, H, W, TYPE, OPENCL); BM_TANH_MACRO(N, C, H, W, float, OPENCL); \
BM_TANH_MACRO(N, C, H, W, half, OPENCL);
BM_TANH(1, 1, 512, 512, float); BM_TANH(1, 1, 512, 512);
BM_TANH(1, 3, 128, 128, float); BM_TANH(1, 3, 128, 128);
BM_TANH(1, 3, 512, 512, float); BM_TANH(1, 3, 512, 512);
BM_TANH(1, 32, 112, 112, float); BM_TANH(1, 32, 112, 112);
BM_TANH(1, 64, 256, 256, float); BM_TANH(1, 64, 256, 256);
template <DeviceType D, typename T> template <DeviceType D, typename T>
static void SigmoidBenchmark( static void SigmoidBenchmark(
...@@ -292,7 +296,7 @@ static void SigmoidBenchmark( ...@@ -292,7 +296,7 @@ static void SigmoidBenchmark(
static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ static void BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \ SigmoidBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
......
...@@ -55,18 +55,18 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -55,18 +55,18 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
} }
} }
#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \ #define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \ const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \ AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
} \ } \
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ADDN(INPUTS, N, H, W, C) \ #define BM_ADDN(INPUTS, N, H, W, C) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \ BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \ BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL); BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL);
......
...@@ -76,7 +76,7 @@ static void BatchNorm( ...@@ -76,7 +76,7 @@ static void BatchNorm(
static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \ BatchNorm<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
...@@ -84,7 +84,6 @@ static void BatchNorm( ...@@ -84,7 +84,6 @@ static void BatchNorm(
#define BM_BATCH_NORM(N, C, H, W) \ #define BM_BATCH_NORM(N, C, H, W) \
BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \ BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, float, NEON); \
BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \ BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \
BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL); BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL);
......
...@@ -41,7 +41,7 @@ static void BMBatchToSpace( ...@@ -41,7 +41,7 @@ static void BMBatchToSpace(
BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \ BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \ BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG); \
} \ } \
......
...@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \ BiasAdd<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
......
...@@ -41,7 +41,7 @@ static void ChannelShuffle( ...@@ -41,7 +41,7 @@ static void ChannelShuffle(
static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE( \ static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \ mace::testing::BytesProcessed(tot *(sizeof(float))); \
ChannelShuffle<DEVICE>(iters, N, C, H, W, G); \ ChannelShuffle<DEVICE>(iters, N, C, H, W, G); \
} \ } \
......
...@@ -29,7 +29,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) { ...@@ -29,7 +29,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
net.RunOp(D); net.RunOp(D);
} }
const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * 2; const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * 2;
mace::testing::ItemsProcessed(tot); mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T)); testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
...@@ -80,7 +80,7 @@ static void OpenclConcatHelper(int iters, ...@@ -80,7 +80,7 @@ static void OpenclConcatHelper(int iters,
const int64_t tot = const int64_t tot =
static_cast<int64_t>(iters) * static_cast<int64_t>(iters) *
(net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size()); (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
mace::testing::ItemsProcessed(tot); mace::testing::MaccProcessed(tot);
testing::BytesProcessed(tot * sizeof(T)); testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
......
...@@ -83,8 +83,20 @@ static void Conv2d(int iters, ...@@ -83,8 +83,20 @@ static void Conv2d(int iters,
static void \ static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \ BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t dilation = 1; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \ Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
OC); \ OC); \
......
...@@ -75,24 +75,36 @@ static void DepthwiseConv2d(int iters, ...@@ -75,24 +75,36 @@ static void DepthwiseConv2d(int iters,
} }
} }
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \ #define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \
DEVICE) \ DEVICE) \
static void \ static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \ BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t dilation = 1; \
mace::testing::ItemsProcessed(tot); \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ int64_t pad_h = 0, pad_w = 0; \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \ if (P == SAME) { \
mace::Padding::P, OC); \ pad_h = KH / 2; \
} \ pad_w = KW / 2; \
BENCHMARK( \ } \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE) int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
} \
BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC) \ #define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU); \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL); BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1); BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1);
......
...@@ -61,7 +61,7 @@ static void EltwiseBenchmark( ...@@ -61,7 +61,7 @@ static void EltwiseBenchmark(
BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \ const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
EltwiseBenchmark<DEVICE, TYPE>( \ EltwiseBenchmark<DEVICE, TYPE>( \
iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \ iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
......
...@@ -40,7 +40,7 @@ static void GlobalAvgPooling( ...@@ -40,7 +40,7 @@ static void GlobalAvgPooling(
static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE( \ static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \ mace::testing::BytesProcessed(tot *(sizeof(float))); \
GlobalAvgPooling<DEVICE>(iters, N, C, H, W); \ GlobalAvgPooling<DEVICE>(iters, N, C, H, W); \
} \ } \
......
...@@ -20,10 +20,8 @@ static void MatMulBenchmark( ...@@ -20,10 +20,8 @@ static void MatMulBenchmark(
net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1}); net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(net, "A", "AImage", BufferToImage<D, T>(net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
kernels::BufferType::IN_OUT_WIDTH); BufferToImage<D, T>(net, "B", "BImage", kernels::BufferType::IN_OUT_HEIGHT);
BufferToImage<D, T>(net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT);
OpDefBuilder("MatMul", "MatMulBM") OpDefBuilder("MatMul", "MatMulBM")
.Input("AImage") .Input("AImage")
...@@ -52,16 +50,19 @@ static void MatMulBenchmark( ...@@ -52,16 +50,19 @@ static void MatMulBenchmark(
net.Sync(); net.Sync();
} }
#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \ #define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \ static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::MaccProcessed(macc); \
MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
} \ MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
} \
BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE) BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
#define BM_MATMUL(N, H, C, W) \ #define BM_MATMUL(N, H, C, W) \
BM_MATMUL_MACRO(N, H, C, W, float, CPU); \
BM_MATMUL_MACRO(N, H, C, W, float, OPENCL); \
BM_MATMUL_MACRO(N, H, C, W, half, OPENCL); BM_MATMUL_MACRO(N, H, C, W, half, OPENCL);
BM_MATMUL(16, 32, 128, 49); BM_MATMUL(16, 32, 128, 49);
......
...@@ -54,7 +54,7 @@ static void Pooling(int iters, ...@@ -54,7 +54,7 @@ static void Pooling(int iters,
BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE( \ BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \ mace::testing::BytesProcessed(tot *(sizeof(float))); \
Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \ Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
PoolingType::PO); \ PoolingType::PO); \
......
...@@ -61,8 +61,9 @@ static void ResizeBilinearBenchmark(int iters, ...@@ -61,8 +61,9 @@ static void ResizeBilinearBenchmark(int iters,
static void \ static void \
BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_##DEVICE( \ BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H1 * W1; \ const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
mace::testing::ItemsProcessed(tot); \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \ ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
} \ } \
......
...@@ -49,7 +49,7 @@ static void SoftmaxBenchmark( ...@@ -49,7 +49,7 @@ static void SoftmaxBenchmark(
#define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \ #define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \
static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \ static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \ SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W); \
} \ } \
......
...@@ -42,7 +42,7 @@ static void BMSpaceToBatch( ...@@ -42,7 +42,7 @@ static void BMSpaceToBatch(
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \ BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \ BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE); \
} \ } \
......
...@@ -41,7 +41,7 @@ static void BMWinogradTransform( ...@@ -41,7 +41,7 @@ static void BMWinogradTransform(
BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \ BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \ } \
...@@ -93,7 +93,7 @@ static void BMWinogradInverseTransform( ...@@ -93,7 +93,7 @@ static void BMWinogradInverseTransform(
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \ BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \ } \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册