提交 3798dc9c 编写于 作者: L liuqi

Add model benchmark for counting the execution time of each Op.

上级
# Benchmark
# Examples
load(
"//:mace.bzl",
"if_production_mode",
"if_not_production_mode",
"if_hexagon_enabled",
"if_openmp_enabled",
)
licenses(["notice"]) # Apache 2.0
# Static library with the per-operator statistics summarizer
# (stat_summarizer.cc / stat_summarizer.h). It only needs the MACE headers.
cc_library(
name = "stat_summarizer",
srcs = ["stat_summarizer.cc"],
hdrs = ["stat_summarizer.h"],
linkstatic = 1,
deps = [
"@mace//:mace_headers",
]
)
# Statically linked benchmark executable built from benchmark_model.cc.
# Always links the stat summarizer, the generated model code and gflags; the
# remaining deps and the -fopenmp link option are contributed by the if_*
# helpers loaded from //:mace.bzl (presumably select()-style switches keyed on
# the build configuration -- confirm in mace.bzl).
cc_binary(
name = "benchmark_model",
srcs = [
"benchmark_model.cc",
],
linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
deps = [
":stat_summarizer",
"//codegen:generated_models",
"//external:gflags_nothreads",
] + if_hexagon_enabled([
"//lib/hexagon:hexagon",
])+ if_production_mode([
"@mace//:mace_prod",
"//codegen:generated_opencl_prod",
"//codegen:generated_tuning_params",
]) + if_not_production_mode([
"@mace//:mace_dev",
]),
)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "gflags/gflags.h"
#include "mace/core/public/mace.h"
#include "benchmark/stat_summarizer.h"
#include <cstdlib>
#include <fstream>
#include <thread>
#include <iostream>
#include <numeric>
#include <sys/time.h>
// Declarations of the model entry points that are defined outside this file.
// MACE_MODEL_TAG is not defined here; presumably it is a macro supplied by the
// build that names the generated model's namespace -- TODO confirm.
namespace mace {
namespace MACE_MODEL_TAG {
// Returns the network definition; Main() passes it to the MaceEngine.
extern NetDef CreateNet();
// Declared for completeness; not referenced anywhere in this file.
extern const std::string ModelChecksum();
}
}
namespace mace {
namespace str_util {
// Splits `str` on the single-character delimiter `delims` and returns the
// tokens in their original order.
//
// - An empty input yields an empty vector.
// - Consecutive delimiters yield empty tokens ("a,,b" -> {"a", "", "b"}).
// - A trailing delimiter does not produce a trailing empty token
//   ("a," -> {"a"}).
//
// The previous implementation pushed the whole remaining string for every
// token ("a,b" -> {"a,b", "b"}); each token is now cut at the next delimiter.
std::vector<std::string> Split(const std::string &str, char delims) {
  std::vector<std::string> result;
  size_t token_begin = 0;
  while (token_begin < str.size()) {
    const size_t next_offset = str.find(delims, token_begin);
    if (next_offset == std::string::npos) {
      // Last token: everything up to the end of the string.
      result.push_back(str.substr(token_begin));
      break;
    }
    result.push_back(str.substr(token_begin, next_offset - token_begin));
    token_begin = next_offset + 1;
  }
  return result;
}
// Parses a `delims`-separated list of base-10 integers from `str` and appends
// them to `*result` in order.
//
// Each token is converted with std::strtoll so the full int64_t range is
// supported (the previous atoi() call produced an `int`, which cannot hold
// large dimensions). As before, a token without leading digits contributes 0,
// an empty input appends nothing, and the function always returns true.
bool SplitAndParseToInts(const std::string &str,
                         char delims,
                         std::vector<int64_t> *result) {
  size_t token_begin = 0;
  while (token_begin < str.size()) {
    // strtoll stops at the first non-numeric character, i.e. at the delimiter.
    const int64_t dim = std::strtoll(str.c_str() + token_begin, nullptr, 10);
    result->push_back(dim);
    const size_t next_offset = str.find(delims, token_begin);
    if (next_offset == std::string::npos) {
      break;
    }
    token_begin = next_offset + 1;
  }
  return true;
}
} // namespace str_util
namespace benchmark {
// Returns the current gettimeofday() reading expressed in microseconds.
inline int64_t NowMicros() {
  struct timeval now;
  gettimeofday(&now, nullptr);
  const int64_t whole_seconds_us = static_cast<int64_t>(now.tv_sec) * 1000000;
  return whole_seconds_us + now.tv_usec;
}
// Executes one inference on `engine` and measures how long engine->Run took.
//
// When `summarizer` is non-null, run metadata is requested from the engine and
// forwarded to summarizer->ProcessMetadata() after a successful run.
// On success the elapsed time is stored in `*inference_time_us` and true is
// returned; on failure an error is printed, `*inference_time_us` is left
// untouched and false is returned.
bool RunInference(MaceEngine *engine,
                  const float *input,
                  const std::vector<int64_t> &input_shape,
                  float *output,
                  StatSummarizer *summarizer,
                  int64_t *inference_time_us) {
  RunMetadata run_metadata;
  // Only ask the engine for metadata when somebody will consume it.
  RunMetadata *const metadata_sink = summarizer ? &run_metadata : nullptr;

  const int64_t begin_us = NowMicros();
  const bool ok = engine->Run(input, input_shape, output, metadata_sink);
  const int64_t finish_us = NowMicros();

  if (ok) {
    *inference_time_us = finish_us - begin_us;
    if (summarizer != nullptr) {
      summarizer->ProcessMetadata(run_metadata);
    }
    return true;
  }

  std::cerr << "Error during inference." << std::endl;
  return ok;
}
// Runs inference repeatedly and reports timing statistics on stdout.
//
// Parameters:
//   engine, input, input_shape, output - forwarded to RunInference().
//   summarizer       - optional; when non-null, per-op metadata is collected.
//   num_runs         - maximum number of iterations; <= 0 means "run until
//                      max_time_sec is exceeded".
//   max_time_sec     - time budget in seconds; <= 0 disables the budget.
//   sleep_sec        - seconds to sleep after each iteration (<= 0: none).
//   total_time_us    - out: sum of the measured inference times.
//   actual_num_runs  - out: number of successful iterations.
//
// Returns false as soon as an inference fails, or when neither `num_runs` nor
// `max_time_sec` bounds the loop; true otherwise. Both out-parameters are
// reset on entry, and a failed inference is not counted in them or in the
// printed statistics.
bool Run(MaceEngine *engine,
         const float *input,
         const std::vector<int64_t> &input_shape,
         float *output,
         StatSummarizer *summarizer,
         int num_runs,
         double max_time_sec,
         int64_t sleep_sec,
         int64_t *total_time_us,
         int64_t *actual_num_runs) {
  *total_time_us = 0;
  *actual_num_runs = 0;

  std::cout << "Running benchmark for max " << num_runs << " iterations, max ";
  std::cout << max_time_sec << " seconds ";
  std::cout << (summarizer != nullptr ? "with " : "without ");
  std::cout << "detailed stat logging, with " << sleep_sec;
  std::cout << "s sleep between inferences" << std::endl;

  const bool until_max_time = (num_runs <= 0);
  if (until_max_time && max_time_sec <= 0) {
    // Without either limit the loop below would never terminate.
    std::cerr << "Neither a run count nor a time limit was given." << std::endl;
    return false;
  }

  Stat<int64_t> stat;
  for (int i = 0; until_max_time || i < num_runs; ++i) {
    int64_t inference_time_us = 0;
    const bool s = RunInference(engine, input, input_shape, output, summarizer,
                                &inference_time_us);
    // Check for failure first so a failed run does not pollute the stats.
    if (!s) {
      std::cout << "Failed on run " << i << std::endl;
      return s;
    }

    stat.UpdateStat(inference_time_us);
    (*total_time_us) += inference_time_us;
    ++(*actual_num_runs);

    if (max_time_sec > 0 && (*total_time_us / 1000000.0) > max_time_sec) {
      break;
    }

    if (sleep_sec > 0) {
      std::this_thread::sleep_for(std::chrono::seconds(sleep_sec));
    }
  }

  std::stringstream stream;
  stat.OutputToStream(&stream);
  std::cout << stream.str() << std::endl;
  return true;
}
// Command-line flags (gflags) consumed by Main().
// Note: max_time, inference_delay and inter_benchmark_delay are string flags;
// Main() converts them with std::strtod / std::strtol.
// show_flops is defined here but is not read anywhere in this file.
DEFINE_string(device, "CPU", "Device [CPU|OPENCL]");
DEFINE_string(input_shape, "", "input shape, separated by comma");
DEFINE_string(output_shape, "", "output shape, separated by comma");
DEFINE_string(input_file, "", "input file name");
DEFINE_int32(max_num_runs, 100, "number of runs max");
DEFINE_string(max_time, "10.0", "length to run max");
DEFINE_string(inference_delay, "-1", "delay between runs in seconds");
DEFINE_string(inter_benchmark_delay, "-1", "delay between benchmarks in seconds");
DEFINE_string(benchmark_name, "", "benchmark name");
DEFINE_bool(show_run_order, true, "whether to list stats by run order");
DEFINE_int32(run_order_limit, 0, "how many items to show by run order");
DEFINE_bool(show_time, true, "whether to list stats by time taken");
DEFINE_int32(time_limit, 10, "how many items to show by time taken");
DEFINE_bool(show_memory, false, "whether to list stats by memory used");
DEFINE_int32(memory_limit, 10, "how many items to show by memory used");
DEFINE_bool(show_type, true, "whether to list stats by op type");
DEFINE_bool(show_summary, true, "whether to show a summary of the stats");
DEFINE_bool(show_flops, true, "whether to estimate the model's FLOPs");
DEFINE_int32(warmup_runs, 1, "how many runs to initialize model");
int Main(int argc, char **argv) {
gflags::SetUsageMessage("some usage message");
gflags::ParseCommandLineFlags(&argc, &argv, true);
std::vector<std::string> input_layer_shapes =
str_util::Split(FLAGS_input_shape, ',');
std::vector<int64_t> input_shape;
mace::str_util::SplitAndParseToInts(FLAGS_input_shape, ',', &input_shape);
std::vector<std::string> output_layer_shapes =
str_util::Split(FLAGS_output_shape, ',');
std::vector<int64_t> output_shape;
mace::str_util::SplitAndParseToInts(FLAGS_input_shape, ',', &output_shape);
std::cout << "Benchmark name: [" << FLAGS_benchmark_name << "]" << std::endl;
std::cout << "Device: [" << FLAGS_device << "]" << std::endl;
std::cout << "Input shapes: [" << FLAGS_input_shape << "]" << std::endl;
std::cout << "output shapes: [" << FLAGS_output_shape << "]" << std::endl;
std::cout << "Warmup runs: [" << FLAGS_warmup_runs << "]" << std::endl;
std::cout << "Num runs: [" << FLAGS_max_num_runs << "]" << std::endl;
std::cout << "Inter-inference delay (seconds): [" << FLAGS_inference_delay << "]" << std::endl;
std::cout << "Inter-benchmark delay (seconds): [" << FLAGS_inter_benchmark_delay << "]" << std::endl;
const long int inter_inference_sleep_seconds =
std::strtol(FLAGS_inference_delay.c_str(), nullptr, 10);
const long int inter_benchmark_sleep_seconds =
std::strtol(FLAGS_inter_benchmark_delay.c_str(), nullptr, 10);
const double max_benchmark_time_seconds =
std::strtod(FLAGS_max_time.c_str(), nullptr);
std::unique_ptr<StatSummarizer> stats;
StatSummarizerOptions stats_options;
stats_options.show_run_order = FLAGS_show_run_order;
stats_options.run_order_limit = FLAGS_run_order_limit;
stats_options.show_time = FLAGS_show_time;
stats_options.time_limit = FLAGS_time_limit;
stats_options.show_memory = FLAGS_show_memory;
stats_options.memory_limit = FLAGS_memory_limit;
stats_options.show_type = FLAGS_show_type;
stats_options.show_summary = FLAGS_show_summary;
stats.reset(new StatSummarizer(stats_options));
DeviceType device_type = CPU;
if(FLAGS_device == "OPENCL") {
device_type = OPENCL;
}
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet();
int64_t input_size = std::accumulate(input_shape.begin(),
input_shape.end(), 1, std::multiplies<int64_t>());
int64_t output_size = std::accumulate(output_shape.begin(),
output_shape.end(), 1, std::multiplies<int64_t>());
std::unique_ptr<float[]> input_data(new float[input_size]);
std::unique_ptr<float[]> output_data(new float[output_size]);
// load input
std::ifstream in_file(FLAGS_input_file, std::ios::in | std::ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(input_data.get()),
input_size * sizeof(float));
in_file.close();
} else {
std::cout << "Open input file failed" << std::endl;
return -1;
}
// Init model
std::cout << "Run init" << std::endl;
mace::MaceEngine engine(&net_def, device_type);
std::cout << "Warm up" << std::endl;
int64_t warmup_time_us = 0;
int64_t num_warmup_runs = 0;
if (FLAGS_warmup_runs > 0) {
bool status =
Run(&engine, input_data.get(), input_shape, output_data.get(),
nullptr, FLAGS_warmup_runs, -1.0,
inter_inference_sleep_seconds, &warmup_time_us, &num_warmup_runs);
if (!status) {
std::cerr << "Failed at warm up run" << std::endl;
}
}
if (inter_benchmark_sleep_seconds > 0) {
std::this_thread::sleep_for(
std::chrono::seconds(inter_benchmark_sleep_seconds));
}
int64_t no_stat_time_us = 0;
int64_t no_stat_runs = 0;
bool status =
Run(&engine, input_data.get(), input_shape, output_data.get(),
nullptr, FLAGS_max_num_runs, max_benchmark_time_seconds,
inter_inference_sleep_seconds, &no_stat_time_us, &no_stat_runs);
if (!status) {
std::cerr << "Failed at normal no-stat run" << std::endl;
}
int64_t stat_time_us = 0;
int64_t stat_runs = 0;
status = Run(&engine, input_data.get(), input_shape, output_data.get(),
stats.get(), FLAGS_max_num_runs, max_benchmark_time_seconds,
inter_inference_sleep_seconds, &stat_time_us, &stat_runs);
if (!status) {
std::cerr << "Failed at normal stat run" << std::endl;
}
std::cout << "Average inference timings in us: ";
std::cout << "Warmup: ";
std::cout << (FLAGS_warmup_runs > 0 ? warmup_time_us / FLAGS_warmup_runs : 0) << ", ";
std::cout << "no stats: " << no_stat_time_us / no_stat_runs << ", ";
std::cout << "with stats: " << stat_time_us / stat_runs << std::endl;
stats->PrintOperatorStats();
return 0;
}
} // namespace benchmark
} // namespace mace
// Process entry point. Propagates the status of mace::benchmark::Main as the
// exit code; previously the result was dropped and the process always
// exited with 0, even when Main returned -1.
int main(int argc, char **argv) { return mace::benchmark::Main(argc, argv); }
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "benchmark/stat_summarizer.h"
#include "mace/core/public/mace.h"
#include <iomanip>
#include <queue>
#include <iostream>
namespace mace {
// Stores a copy of the reporting options; no statistics are recorded yet.
StatSummarizer::StatSummarizer(const StatSummarizerOptions &options)
    : options_(options) {}

// Nothing to release explicitly: all members clean up after themselves.
StatSummarizer::~StatSummarizer() = default;
// Discards everything recorded so far: the per-node details as well as the
// aggregate run-time and memory statistics.
void StatSummarizer::Reset() {
  details_.clear();
  memory_.Reset();
  run_total_us_.Reset();
}
void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) {
int64_t curr_total_us = 0;
int64_t mem_total = 0;
if (run_metadata.op_stats.empty()) {
std::cerr << "Runtime op stats should not be empty" << std::endl;
abort();
}
int64_t first_node_start_us = run_metadata.op_stats[0].stats.start_micros;
int node_num = 0;
for (const auto &ops : run_metadata.op_stats) {
std::string name = ops.operator_name;
std::string op_type = ops.type;
++node_num;
const int64_t curr_time = ops.stats.end_micros - ops.stats.start_micros;
curr_total_us += curr_time;
auto result = details_.emplace(name, Detail());
Detail *detail = &(result.first->second);
detail->start_us.UpdateStat(ops.stats.start_micros - first_node_start_us);
detail->rel_end_us.UpdateStat(curr_time);
// If this is the first pass, initialize some values.
if (result.second) {
detail->name = name;
detail->type = op_type;
detail->run_order = node_num;
detail->times_called = 0;
}
++detail->times_called;
}
run_total_us_.UpdateStat(curr_total_us);
memory_.UpdateStat(mem_total);
}
// Returns a three-line summary: total run-time statistics, memory statistics
// and the number of distinct nodes observed.
std::string StatSummarizer::ShortSummary() const {
  std::stringstream out;

  out << "Timings (microseconds): ";
  run_total_us_.OutputToStream(&out);

  out << std::endl << "Memory (bytes): ";
  memory_.OutputToStream(&out);

  out << std::endl << details_.size() << " nodes observed" << std::endl;
  return out.str();
}
// Starts a new tab-separated column on `stream`: writes a tab, then sets up
// right alignment, the given field width and fixed notation with three
// decimals for the value the caller streams next. Returns `stream`.
std::ostream &InitField(std::ostream &stream, int width) {
  // The tab goes out first so the width set below applies to the next value.
  stream << "\t";
  stream.setf(std::ios_base::right, std::ios_base::adjustfield);
  stream.width(width);
  stream.setf(std::ios_base::fixed, std::ios_base::floatfield);
  stream.precision(3);
  return stream;
}
// Builds the banner line for `title` followed by the column captions of the
// per-node table (no trailing newline).
std::string StatSummarizer::HeaderString(const std::string &title) const {
  struct Column {
    int width;
    const char *caption;
  };
  static const Column kColumns[] = {
      {14, "[node type]"}, {9, "[start]"},   {9, "[first]"},
      {9, "[avg ms]"},     {8, "[%]"},       {8, "[cdf%]"},
      {10, "[mem KB]"},    {9, "[times called]"},
  };

  std::stringstream out;
  out << "============================== " << title
      << " ==============================" << std::endl;
  for (const Column &column : kColumns) {
    InitField(out, column.width) << column.caption;
  }
  out << "\t"
      << "[Name]";
  return out.str();
}
// Formats one table row for `detail`.
//
// `cumulative_stat_on_node` is the running sum used for the cdf column and
// `stat` supplies the total that both percentages are computed against.
// Times are converted from microseconds to milliseconds; the memory column
// divides by 1000.
std::string StatSummarizer::ColumnString(const StatSummarizer::Detail &detail,
                                         const int64_t cumulative_stat_on_node,
                                         const Stat<int64_t> &stat) const {
  const Stat<int64_t> &elapsed = detail.rel_end_us;

  const double share_of_total = elapsed.sum() * 100.0 / stat.sum();
  const double cumulative_share =
      (cumulative_stat_on_node * 100.0f) / stat.sum();
  const int64_t calls_per_run = detail.times_called / num_runs();
  const double start_offset_ms = detail.start_us.avg() / 1000.0;
  const double first_run_ms = elapsed.first() / 1000.0;
  const double average_ms = elapsed.avg() / 1000.0;

  std::stringstream row;
  InitField(row, 14) << detail.type;
  InitField(row, 9) << start_offset_ms;
  InitField(row, 9) << first_run_ms;
  InitField(row, 9) << average_ms;
  InitField(row, 7) << share_of_total << "%";
  InitField(row, 7) << cumulative_share << "%";
  InitField(row, 10) << detail.mem_used.newest() / 1000.0;
  InitField(row, 9) << calls_per_run;
  row << "\t" << detail.name;
  return row.str();
}
// Appends pointers to all recorded details to `*details`, ordered by `metric`.
//
// Every node gets a textual sort key (right-aligned, width 20, fixed notation
// with 10 decimals) and the nodes are emitted from the largest key to the
// smallest. For BY_RUN_ORDER the key is `node count - run_order`, so nodes
// that ran earlier come out first.
void StatSummarizer::OrderNodesByMetric(
    SortingMetric metric, std::vector<const Detail *> *details) const {
  typedef std::pair<std::string, const Detail *> KeyedDetail;
  std::priority_queue<KeyedDetail> ranking;
  const int node_count = details_.size();

  for (const auto &named_detail : details_) {
    const Detail &node = named_detail.second;

    std::stringstream key;
    key << std::setw(20) << std::right << std::setprecision(10) << std::fixed;
    if (metric == BY_NAME) {
      key << node.name;
    } else if (metric == BY_RUN_ORDER) {
      key << node_count - node.run_order;
    } else if (metric == BY_TIME) {
      key << node.rel_end_us.avg();
    } else if (metric == BY_MEMORY) {
      key << node.mem_used.avg();
    } else if (metric == BY_TYPE) {
      key << node.type;
    } else {
      key << "";
    }
    ranking.emplace(key.str(), &node);
  }

  // Drain the heap: the largest key is always on top.
  for (; !ranking.empty(); ranking.pop()) {
    details->push_back(ranking.top().second);
  }
}
void StatSummarizer::ComputeStatsByType(
std::map<std::string, int64_t> *node_type_map_count,
std::map<std::string, int64_t> *node_type_map_time,
std::map<std::string, int64_t> *node_type_map_memory,
std::map<std::string, int64_t> *node_type_map_times_called,
int64_t *accumulated_us) const {
int64_t run_count = run_total_us_.count();
for (const auto &det : details_) {
const std::string node_name = det.first;
const Detail &detail = det.second;
int64_t curr_time_val =
static_cast<int64_t>(detail.rel_end_us.sum() / run_count);
*accumulated_us += curr_time_val;
int64_t curr_memory_val = detail.mem_used.newest();
const std::string &node_type = detail.type;
(*node_type_map_count)[node_type] += 1;
(*node_type_map_time)[node_type] += curr_time_val;
(*node_type_map_memory)[node_type] += curr_memory_val;
(*node_type_map_times_called)[node_type] += detail.times_called / run_count;
}
}
// Returns the "Summary by node type" table: one row per node type, ordered
// from the most to the least time-consuming type, with a running cdf column.
// As a side effect the number of recorded nodes is printed to std::cout.
std::string StatSummarizer::GetStatsByNodeType() const {
  std::stringstream report;
  report << "============================== Summary by node type "
            "=============================="
         << std::endl;
  std::cout << "Number of nodes executed: " << details_.size() << std::endl;

  std::map<std::string, int64_t> count_by_type;
  std::map<std::string, int64_t> time_by_type;
  std::map<std::string, int64_t> memory_by_type;
  std::map<std::string, int64_t> calls_by_type;
  int64_t accumulated_us = 0;
  ComputeStatsByType(&count_by_type, &time_by_type, &memory_by_type,
                     &calls_by_type, &accumulated_us);

  // Max-heap keyed by time so the most expensive type is emitted first.
  typedef std::pair<std::string, int64_t> TypeAndMemory;
  std::priority_queue<std::pair<int64_t, TypeAndMemory>> by_time;
  for (const auto &timed : time_by_type) {
    const int64_t memory_of_type = memory_by_type[timed.first];
    by_time.emplace(timed.second, TypeAndMemory(timed.first, memory_of_type));
  }

  InitField(report, 14) << "[Node type]";
  InitField(report, 9) << "[count]";
  InitField(report, 10) << "[avg ms]";
  InitField(report, 11) << "[avg %]";
  InitField(report, 11) << "[cdf %]";
  InitField(report, 10) << "[mem KB]";
  InitField(report, 10) << "[times called]";
  report << std::endl;

  float cdf = 0.0f;
  for (; !by_time.empty(); by_time.pop()) {
    const int64_t type_total_us = by_time.top().first;
    const std::string type_name = by_time.top().second.first;
    const float memory_kb = by_time.top().second.second / 1000.0f;
    const float per_run_ms = type_total_us / 1000.0f;
    const float share =
        ((type_total_us / static_cast<float>(accumulated_us)) * 100.0f);
    cdf += share;

    InitField(report, 14) << type_name;
    InitField(report, 9) << count_by_type[type_name];
    InitField(report, 10) << per_run_ms;
    InitField(report, 10) << share << "%";
    InitField(report, 10) << cdf << "%";
    InitField(report, 10) << memory_kb;
    InitField(report, 9) << calls_by_type[type_name];
    report << std::endl;
  }
  report << std::endl;
  return report.str();
}
std::string StatSummarizer::GetStatsByMetric(const std::string &title,
SortingMetric sorting_metric,
int num_stats) const {
std::vector<const Detail *> details;
OrderNodesByMetric(sorting_metric, &details);
double cumulative_stat_on_node = 0;
std::stringstream stream;
stream << HeaderString(title) << std::endl;
int stat_num = 0;
for (auto detail : details) {
++stat_num;
if (num_stats > 0 && stat_num > num_stats) {
break;
}
cumulative_stat_on_node += detail->rel_end_us.sum();
stream << ColumnString(*detail, cumulative_stat_on_node, run_total_us_)
<< std::endl;
}
stream << std::endl;
return stream.str();
}
std::string StatSummarizer::GetOutputString() const {
std::stringstream stream;
if (options_.show_run_order) {
stream << GetStatsByMetric("Run Order", BY_RUN_ORDER,
options_.run_order_limit);
}
if (options_.show_time) {
stream << GetStatsByMetric("Top by Computation Time", BY_TIME,
options_.time_limit);
}
if (options_.show_memory) {
stream << GetStatsByMetric("Top by Memory Use", BY_MEMORY,
options_.memory_limit);
}
if (options_.show_type) {
stream << GetStatsByNodeType();
}
if (options_.show_summary) {
stream << ShortSummary() << std::endl;
}
return stream.str();
}
// Writes the report produced by GetOutputString() to std::cout, one line at a
// time, flushing after every line.
void StatSummarizer::PrintOperatorStats() const {
  std::istringstream report(GetOutputString());
  std::string line;
  while (std::getline(report, line)) {
    std::cout << line << std::endl;
  }
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
#define MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
#include <stdlib.h>
#include <cmath>
#include <limits>
#include <map>
#include <sstream>
#include <string>
#include <vector>
namespace mace {
class RunMetadata;
// Running statistics (count, first/newest, min/max, sum, sum of squares) over
// a stream of values of type ValueType. HighPrecisionValueType is the type
// used for the average and for the sum of squares.
template <typename ValueType, typename HighPrecisionValueType = double>
class Stat {
 public:
  // Records one more sample.
  void UpdateStat(ValueType v) {
    if (count_ == 0) {
      first_ = v;
    }
    newest_ = v;
    max_ = std::max(v, max_);
    min_ = std::min(v, min_);
    ++count_;
    sum_ += v;
    squared_sum_ += static_cast<HighPrecisionValueType>(v) * v;
  }

  // Returns the object to its freshly constructed state.
  void Reset() { *this = Stat(); }

  bool empty() const { return count_ == 0; }
  ValueType first() const { return first_; }
  ValueType newest() const { return newest_; }
  ValueType max() const { return max_; }
  ValueType min() const { return min_; }
  int64_t count() const { return count_; }
  ValueType sum() const { return sum_; }
  HighPrecisionValueType squared_sum() const { return squared_sum_; }

  // True when no sample or only identical samples have been recorded.
  bool all_same() const { return (count_ == 0 || min_ == max_); }

  // Mean of the recorded samples. For an empty Stat this returns
  // numeric_limits<ValueType>::quiet_NaN(), which is NaN for floating-point
  // ValueType and 0 for integer ValueType.
  HighPrecisionValueType avg() const {
    return empty() ? std::numeric_limits<ValueType>::quiet_NaN()
                   : static_cast<HighPrecisionValueType>(sum_) / count_;
  }

  // Population standard deviation, converted to ValueType.
  ValueType std_deviation() const {
    return all_same() ? 0 : std::sqrt(squared_sum_ / count_ - avg() * avg());
  }

  // Writes a one-line, human-readable summary to `*stream`.
  void OutputToStream(std::ostream *stream) const {
    if (empty()) {
      *stream << "count=0";
    } else if (all_same()) {
      *stream << "count=" << count_ << " curr=" << newest_;
      if (count_ > 1) *stream << "(all same)";
    } else {
      *stream << "count=" << count_ << " first=" << first_
              << " curr=" << newest_ << " min=" << min_ << " max=" << max_
              << " avg=" << avg() << " std=" << std_deviation();
    }
  }

  // Takes `const Stat &` (the current instantiation) so that streaming also
  // works when HighPrecisionValueType is not the default.
  friend std::ostream &operator<<(std::ostream &stream, const Stat &stat) {
    stat.OutputToStream(&stream);
    return stream;
  }

 private:
  ValueType first_ = 0;
  ValueType newest_ = 0;
  // lowest(), not min(): for floating-point types min() is the smallest
  // positive value, which would be wrong as the seed of a running maximum.
  ValueType max_ = std::numeric_limits<ValueType>::lowest();
  ValueType min_ = std::numeric_limits<ValueType>::max();
  int64_t count_ = 0;
  ValueType sum_ = 0;
  HighPrecisionValueType squared_sum_ = 0;
};
// Used to control the output of the statistics summarizer;
// Switches and row limits that control which sections the statistics
// summarizer prints. The in-class initializers are the defaults.
class StatSummarizerOptions {
 public:
  StatSummarizerOptions() = default;

  // Per-node table in run order, and its row limit.
  bool show_run_order = true;
  int run_order_limit = 0;
  // Per-node table ordered by time, and its row limit.
  bool show_time = true;
  int time_limit = 10;
  // Per-node table ordered by memory, and its row limit.
  bool show_memory = true;
  int memory_limit = 10;
  // Summary aggregated by node type.
  bool show_type = true;
  // Short overall summary.
  bool show_summary = true;
};
// A StatSummarizer assists in performance analysis of model executions.
//
// It accumulates the per-operator timings reported in RunMetadata across
// multiple runs and renders them as text tables (per node, per node type and
// a short summary), as selected by StatSummarizerOptions.
//
// NOTE(review): the original comment pointed at
// tensorflow/tools/benchmark/benchmark_model.cc for example usage; this class
// appears to be adapted from TensorFlow's summarizer -- confirm.
class StatSummarizer {
public:
// Ordering criteria accepted by GetStatsByMetric().
enum SortingMetric {
BY_NAME,
BY_RUN_ORDER,
BY_TIME,
BY_MEMORY,
BY_TYPE,
};
explicit StatSummarizer(const StatSummarizerOptions &options);
~StatSummarizer();
// Adds the operator stats of another run to the aggregate counts.
void ProcessMetadata(const RunMetadata &run_metadata);
// Returns a string detailing the accumulated runtime stats in a tab-separated
// format which can be pasted into a spreadsheet for further analysis.
std::string GetOutputString() const;
// Returns the total-time and memory statistics plus the number of nodes seen.
std::string ShortSummary() const;
// Prints the string returned by GetOutputString().
void PrintOperatorStats() const;
// Aggregates the recorded details per node type into the given maps and adds
// each node's per-run average time to *accumulated_us.
void ComputeStatsByType(
std::map<std::string, int64_t> *node_type_map_count,
std::map<std::string, int64_t> *node_type_map_time,
std::map<std::string, int64_t> *node_type_map_memory,
std::map<std::string, int64_t> *node_type_map_times_called,
int64_t *accumulated_us) const;
// Returns the table summarizing time and memory per node type.
std::string GetStatsByNodeType() const;
// Returns the per-node table ordered by sorting_metric; num_stats > 0 limits
// the number of rows.
std::string GetStatsByMetric(const std::string &title,
SortingMetric sorting_metric,
int num_stats) const;
// Clears all accumulated statistics.
void Reset();
// Returns number of runs.
int num_runs() const { return run_total_us_.count(); }
// Returns stats of total microseconds spent by all nodes in each run.
const Stat<int64_t> &run_total_us() const { return run_total_us_; }
private:
// Statistics accumulated for one operator, keyed by its name in details_.
struct Detail {
std::string name;
std::string type;
// 1-based position of the operator in the run in which it was first seen.
int64_t run_order;
// Start offset relative to the first operator of the run, in microseconds.
Stat<int64_t> start_us;
// Despite the name, this holds the operator's elapsed time (end - start).
Stat<int64_t> rel_end_us;
// Never updated by ProcessMetadata in stat_summarizer.cc; only read.
Stat<int64_t> mem_used;
// Total number of times the operator was recorded across all runs.
int64_t times_called;
};
// Appends pointers to all details to *details, ordered by sorting_metric.
void OrderNodesByMetric(SortingMetric sorting_metric,
std::vector<const Detail *> *details) const;
// Returns the banner and column captions of a per-node table.
std::string HeaderString(const std::string &title) const;
// Returns one formatted row of a per-node table.
std::string ColumnString(const Detail &detail,
const int64_t cumulative_stat_on_node,
const Stat<int64_t> &stat) const;
Stat<int64_t> run_total_us_;
Stat<int64_t> memory_;
std::map<std::string, Detail> details_;
StatSummarizerOptions options_;
};
} // namespace mace
#endif // MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册