From 9d40ffd22cc95ae8e6db583038f694d5425169f1 Mon Sep 17 00:00:00 2001 From: liuqi Date: Tue, 17 Oct 2017 15:49:13 +0800 Subject: [PATCH] Add op run metadata statistics and benchmark framework. --- mace/core/BUILD | 2 + mace/core/net.cc | 15 +- mace/core/net.h | 5 +- mace/examples/BUILD | 1 - mace/examples/mace_run.cc | 5 + mace/proto/BUILD | 10 + mace/proto/stats.proto | 16 ++ mace/tools/benchmark/BUILD | 36 +++ mace/tools/benchmark/benchmark_model.cc | 333 ++++++++++++++++++++++++ mace/tools/benchmark/stat_summarizer.cc | 318 ++++++++++++++++++++++ mace/tools/benchmark/stat_summarizer.h | 198 ++++++++++++++ mace/utils/utils.h | 10 + 12 files changed, 945 insertions(+), 4 deletions(-) create mode 100644 mace/proto/stats.proto create mode 100644 mace/tools/benchmark/BUILD create mode 100644 mace/tools/benchmark/benchmark_model.cc create mode 100644 mace/tools/benchmark/stat_summarizer.cc create mode 100644 mace/tools/benchmark/stat_summarizer.h diff --git a/mace/core/BUILD b/mace/core/BUILD index b37f0cba..b1eab0b2 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD @@ -46,6 +46,8 @@ cc_library( copts = ["-std=c++11"], deps = [ "//mace/proto:cc_proto", + "//mace/proto:stats_proto", + "//mace/utils:utils", ], linkopts = if_android([ "-llog", diff --git a/mace/core/net.cc b/mace/core/net.cc index c5877f4a..80ccc16d 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -3,6 +3,7 @@ // #include "mace/core/net.h" +#include "mace/utils/utils.h" namespace mace { @@ -28,15 +29,27 @@ SimpleNet::SimpleNet(const std::shared_ptr& net_def, } } } -bool SimpleNet::Run() { +bool SimpleNet::Run(RunMetadata* run_metadata) { VLOG(1) << "Running net " << name_; for (auto& op : operators_) { VLOG(1) << "Running operator " << op->debug_def().name() << "(" << op->debug_def().type() << ")."; + OperatorStats* op_stats = nullptr; + if (run_metadata) { + op_stats = run_metadata->add_op_stats(); + op_stats->set_operator_name(op->debug_def().name()); + op_stats->set_type(op->debug_def().type()); + op_stats->set_all_start_micros(NowInMicroSec()); + op_stats->set_op_start_rel_micros(NowInMicroSec() - op_stats->all_start_micros()); + } if (!op->Run()) { LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def()); return false; } + if (op_stats) { + op_stats->set_op_end_rel_micros(NowInMicroSec() - op_stats->all_start_micros()); + op_stats->set_all_end_rel_micros(NowInMicroSec() - op_stats->all_start_micros()); + } VLOG(1) << "Op " << op->debug_def().name() << " has shape: " << internal::MakeString(op->Output(0)->shape()); } diff --git a/mace/core/net.h b/mace/core/net.h index 95608972..14b140fb 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -9,6 +9,7 @@ #include "mace/core/operator.h" #include "mace/core/workspace.h" #include "mace/proto/mace.pb.h" +#include "mace/proto/stats.pb.h" namespace mace { @@ -19,7 +20,7 @@ class NetBase { DeviceType type); virtual ~NetBase() noexcept {} - virtual bool Run() = 0; + virtual bool Run(RunMetadata* run_metadata = nullptr) = 0; const string& Name() const { return name_; } @@ -35,7 +36,7 @@ class SimpleNet : public NetBase { Workspace* ws, DeviceType type); - bool Run() override; + bool Run(RunMetadata* run_metadata = nullptr) override; protected: vector > operators_; diff --git a/mace/examples/BUILD b/mace/examples/BUILD index 91a1969d..0e5f062f 100644 --- a/mace/examples/BUILD +++ b/mace/examples/BUILD @@ -11,7 +11,6 @@ cc_binary( deps = [ "//mace/core", "//mace/ops", - "@org_tensorflow//tensorflow/core:android_tensorflow_lib", ], ) diff --git a/mace/examples/mace_run.cc b/mace/examples/mace_run.cc index e969010c..fcd88ae1 100644 --- a/mace/examples/mace_run.cc +++ b/mace/examples/mace_run.cc @@ -101,6 +101,11 @@ int main(int argc, char **argv) { VLOG(0) << device_type; auto net = CreateNet(net_def, &ws, device_type); + // warm up + for (int i = 0; i < 2; ++i) { + net->Run(); + } + timeval tv1, tv2; gettimeofday(&tv1, NULL); for (int i = 0; i < round; ++i) { diff --git a/mace/proto/BUILD b/mace/proto/BUILD index ed124858..1ab23234 100644 --- a/mace/proto/BUILD +++ b/mace/proto/BUILD @@ -20,6 +20,16 @@ cc_proto_library( deps = [":proto"], ) +proto_library( + name = "stats", + srcs = ["stats.proto"], +) + +cc_proto_library( + name = "stats_proto", + deps = [":stats"], +) + py_proto_library( name = "mace_py", srcs = ["mace.proto"], diff --git a/mace/proto/stats.proto b/mace/proto/stats.proto new file mode 100644 index 00000000..6e06b467 --- /dev/null +++ b/mace/proto/stats.proto @@ -0,0 +1,16 @@ +syntax = "proto2"; + +package mace; + +message OperatorStats { + optional string operator_name = 1; + optional string type = 2; + optional int64 all_start_micros = 3; + optional int64 op_start_rel_micros = 4; + optional int64 op_end_rel_micros = 5; + optional int64 all_end_rel_micros = 6; +}; + +message RunMetadata { + repeated OperatorStats op_stats = 1; +} diff --git a/mace/tools/benchmark/BUILD b/mace/tools/benchmark/BUILD new file mode 100644 index 00000000..87377d99 --- /dev/null +++ b/mace/tools/benchmark/BUILD @@ -0,0 +1,36 @@ +# Benchmark +load("//mace:mace.bzl", "if_android") + +licenses(["notice"]) # Apache 2.0 + +cc_library( + name = "stat_summarizer", + srcs = ["stat_summarizer.cc"], + hdrs = ["stat_summarizer.h"], + copts = ["-std=c++11"], + linkopts = ["-fopenmp"] + if_android([ + "-ldl", + "-lm", + ]), + linkstatic = 1, + deps = [ + "//mace/core", + "//mace/proto:stats_proto", + ], +) + +cc_binary( + name = "benchmark_model", + srcs = [ + "benchmark_model.cc", + ], + copts = ["-std=c++11"], + linkopts = ["-fopenmp"] + if_android(["-ldl"]), + linkstatic = 1, + deps = [ + ":stat_summarizer", + "//mace/core", + "//mace/ops", + "//mace/utils:command_line_flags", + ], +) diff --git a/mace/tools/benchmark/benchmark_model.cc b/mace/tools/benchmark/benchmark_model.cc new file mode 100644 index 00000000..3b74da18 --- /dev/null +++ b/mace/tools/benchmark/benchmark_model.cc @@ -0,0 +1,333 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/net.h" +#include "mace/utils/command_line_flags.h" +#include "mace/tools/benchmark/stat_summarizer.h" +#include "mace/utils/utils.h" + +#include +#include + + +namespace mace { +namespace str_util { + +std::vector Split(const string &str, char delims) { + std::vector result; + string tmp = str; + while (!tmp.empty()) { + result.push_back(tmp.data()); + size_t next_offset = tmp.find(delims); + if (next_offset == string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } + return result; +} + + +bool SplitAndParseToInts(const string &str, char delims, std::vector* result) { + string tmp = str; + while (!tmp.empty()) { + index_t dim = atoi(tmp.data()); + result->push_back(dim); + size_t next_offset = tmp.find(delims); + if (next_offset == string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +} // namespace str_util + +namespace benchmark { + +bool RunInference(NetBase* net, StatSummarizer* summarizer, int64_t* inference_time_us) { + RunMetadata run_metadata; + RunMetadata* run_metadata_ptr = nullptr; + if (summarizer) { + run_metadata_ptr = &run_metadata; + } + const int64_t start_time = NowInMicroSec(); + bool s = net->Run(run_metadata_ptr); + const int64_t end_time = NowInMicroSec(); + + if (!s) { + LOG(ERROR) << "Error during inference."; + return s; + } + *inference_time_us = end_time - start_time; + + if (summarizer != nullptr) { + summarizer->ProcessMetadata(run_metadata); + } + + return true; +} + +bool Run(NetBase* net, StatSummarizer* summarizer, + int num_runs, double max_time_sec, int64_t sleep_sec, + int64_t* total_time_us, int64_t* actual_num_runs) { + *total_time_us = 0; + + LOG(INFO) << "Running benchmark for max " << num_runs << " iterators, max " + << max_time_sec << " seconds " + << (summarizer != nullptr ? "with " : "without ") + << "detailed stat logging, with " << sleep_sec + << "s sleep between inferences"; + + Stat stat; + + bool util_max_time = (num_runs <= 0); + for (int i = 0; util_max_time || i < num_runs ; ++i) { + int64_t inference_time_us = 0; + bool s = RunInference(net, summarizer, &inference_time_us); + stat.UpdateStat(inference_time_us); + (*total_time_us) += inference_time_us; + ++(*actual_num_runs); + + if (max_time_sec > 0 && (*total_time_us / 1000000.0) > max_time_sec) { + break; + } + + if (!s) { + LOG(INFO) << "Failed on run " << i; + return s; + } + + if (sleep_sec > 0) { + std::this_thread::sleep_for(std::chrono::seconds(sleep_sec)); + } + } + + std::stringstream stream; + stat.OutputToStream(&stream); + LOG(INFO) << stream.str(); + + return true; +} + +int Main(int argc, char** argv) { + std::string model_file = "/data/local/tmp/mobi_mace.pb"; + std::string device = "CPU"; + std::string input_layer_string = "input:0"; + std::string input_layer_shape_string = "1,224,224,3"; + std::string input_layer_type_string = "float"; + std::string input_layer_files_string = ""; + std::string output_layer_string = "output:0"; + int max_num_runs = 10; + std::string max_time = "10.0"; + std::string inference_delay = "-1"; + std::string inter_benchmark_delay = "-1"; + int num_threads = -1; + std::string benchmark_name = ""; + std::string output_prefix = ""; + bool show_sizes = false; + bool show_run_order = true; + int run_order_limit = 0; + bool show_time = true; + int time_limit = 10; + bool show_memory = true; + int memory_limit = 10; + bool show_type = true; + bool show_summary = true; + bool show_flops = false; + int warmup_runs = 2; + + std::vector flag_list = { + Flag("model_file", &model_file, "graph file name"), + Flag("device", &device, "CPU/NEON"), + Flag("input_layer", &input_layer_string, "input layer names"), + Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"), + Flag("input_layer_type", &input_layer_type_string, "input layer type"), + Flag("input_layer_files", &input_layer_files_string, + "files to initialize the inputs with"), + Flag("output_layer", &output_layer_string, "output layer name"), + Flag("max_num_runs", &max_num_runs, "number of runs max"), + Flag("max_time", &max_time, "length to run max"), + Flag("inference_delay", &inference_delay, + "delay between runs in seconds"), + Flag("inter_benchmark_delay", &inter_benchmark_delay, + "delay between benchmarks in seconds"), + Flag("num_threads", &num_threads, "number of threads"), + Flag("benchmark_name", &benchmark_name, "benchmark name"), + Flag("output_prefix", &output_prefix, "benchmark output prefix"), + Flag("show_sizes", &show_sizes, "whether to show sizes"), + Flag("show_run_order", &show_run_order, + "whether to list stats by run order"), + Flag("run_order_limit", &run_order_limit, + "how many items to show by run order"), + Flag("show_time", &show_time, "whether to list stats by time taken"), + Flag("time_limit", &time_limit, "how many items to show by time taken"), + Flag("show_memory", &show_memory, "whether to list stats by memory used"), + Flag("memory_limit", &memory_limit, + "how many items to show by memory used"), + Flag("show_type", &show_type, "whether to list stats by op type"), + Flag("show_summary", &show_summary, + "whether to show a summary of the stats"), + Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"), + Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"), + }; + string usage = Flags::Usage(argv[0], flag_list); + const bool parse_result = Flags::Parse(&argc, argv, flag_list); + + if (!parse_result) { + LOG(ERROR) << usage; + return -1; + } + + std::vector input_layers = str_util::Split(input_layer_string, ','); + std::vector input_layer_shapes = str_util::Split(input_layer_shape_string, ':'); + std::vector input_layer_types = + str_util::Split(input_layer_type_string, ','); + std::vector input_layer_files = + str_util::Split(input_layer_files_string, ':'); + std::vector output_layers = str_util::Split(output_layer_string, ','); + if ((input_layers.size() != input_layer_shapes.size()) || + (input_layers.size() != input_layer_types.size())) { + LOG(ERROR) << "There must be the same number of items in --input_layer," + << " --input_layer_shape, and --input_layer_type, for example" + << " --input_layer=input1,input2 --input_layer_type=float,float " + << " --input_layer_shape=1,224,224,4:1,20"; + LOG(ERROR) << "--input_layer=" << input_layer_string << " (" + << input_layers.size() << " items)"; + LOG(ERROR) << "--input_layer_type=" << input_layer_type_string << " (" + << input_layer_types.size() << " items)"; + LOG(ERROR) << "--input_layer_shape=" << input_layer_shape_string << " (" + << input_layer_shapes.size() << " items)"; + return -1; + } + const size_t inputs_count = input_layers.size(); + + if (argc > 1) { + LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage; + return -1; + } + + LOG(INFO) << "Model file: [" << model_file << "]"; + LOG(INFO) << "Device: [" << device << "]"; + LOG(INFO) << "Input layers: [" << input_layer_string << "]"; + LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]"; + LOG(INFO) << "Input types: [" << input_layer_type_string << "]"; + LOG(INFO) << "Output layers: [" << output_layer_string << "]"; + LOG(INFO) << "Num runs: [" << max_num_runs << "]"; + LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]"; + LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay + << "]"; + LOG(INFO) << "Num threads: [" << num_threads << "]"; + LOG(INFO) << "Benchmark name: [" << benchmark_name << "]"; + LOG(INFO) << "Output prefix: [" << output_prefix << "]"; + LOG(INFO) << "Show sizes: [" << show_sizes << "]"; + LOG(INFO) << "Warmup runs: [" << warmup_runs << "]"; + + const long int inter_inference_sleep_seconds = + std::strtol(inference_delay.c_str(), nullptr, 10); + const long int inter_benchmark_sleep_seconds = + std::strtol(inter_benchmark_delay.c_str(), nullptr, 10); + const double max_benchmark_time_seconds = + std::strtod(max_time.c_str(), nullptr); + + std::unique_ptr stats; + + StatSummarizerOptions stats_options; + stats_options.show_run_order = show_run_order; + stats_options.run_order_limit = run_order_limit; + stats_options.show_time = show_time; + stats_options.time_limit = time_limit; + stats_options.show_memory = show_memory; + stats_options.memory_limit = memory_limit; + stats_options.show_type = show_type; + stats_options.show_summary = show_summary; + stats.reset(new StatSummarizer(stats_options)); + + // load model + std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary); + if (!model_file_stream.is_open()) { + LOG(ERROR) << "model file open failed"; + return -1; + } + NetDef net_def; + net_def.ParseFromIstream(&model_file_stream); + model_file_stream.close(); + + Workspace ws; + ws.LoadModelTensor(net_def, DeviceType::CPU); + // Load inputs + for (size_t i = 0; i < inputs_count; ++i) { + Tensor *input_tensor = ws.CreateTensor(input_layers[i], + cpu_allocator(), DT_FLOAT); + vector shapes; + str_util::SplitAndParseToInts(input_layer_shapes[i], ',', &shapes); + input_tensor->Resize(shapes); + float *input_data = input_tensor->mutable_data(); + + + // load input + if (i < input_layer_files.size()) { + std::ifstream in_file(input_layer_files[i], std::ios::in | std::ios::binary); + in_file.read(reinterpret_cast(input_data), + input_tensor->size() * sizeof(float)); + in_file.close(); + } + } + + // create net + DeviceType device_type; + DeviceType_Parse(device, &device_type); + auto net = CreateNet(net_def, &ws, device_type); + + int64_t warmup_time_us = 0; + int64_t num_warmup_runs = 0; + if (warmup_runs > 0) { + bool status = Run(net.get(), nullptr, + warmup_runs, -1.0, inter_inference_sleep_seconds, + &warmup_time_us, &num_warmup_runs); + if (!status) { + LOG(ERROR) << "Failed at warm up run"; + } + } + + if (inter_benchmark_sleep_seconds > 0) { + std::this_thread::sleep_for(std::chrono::seconds(inter_benchmark_sleep_seconds)); + } + int64_t no_stat_time_us = 0; + int64_t no_stat_runs = 0; + bool status = Run(net.get(), nullptr, + max_num_runs, max_benchmark_time_seconds, inter_inference_sleep_seconds, + &no_stat_time_us, &no_stat_runs); + if (!status) { + LOG(ERROR) << "Failed at normal no-stat run"; + } + + int64_t stat_time_us = 0; + int64_t stat_runs = 0; + status = Run(net.get(), stats.get(), + max_num_runs, max_benchmark_time_seconds, inter_inference_sleep_seconds, + &stat_time_us, &stat_runs); + if (!status) { + LOG(ERROR) << "Failed at normal stat run"; + } + + LOG(INFO) << "Average inference timings in us: " + << "Warmup: " + << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", " + << "no stats: " << no_stat_time_us / no_stat_runs << ", " + << "with stats: " << stat_time_us / stat_runs; + + stats->PrintOperatorStats(); + + return 0; +} + +} // namespace benchmark +} // namespace mace + +int main (int argc, char** argv) { + mace::benchmark::Main(argc, argv); +} diff --git a/mace/tools/benchmark/stat_summarizer.cc b/mace/tools/benchmark/stat_summarizer.cc new file mode 100644 index 00000000..ff3d504b --- /dev/null +++ b/mace/tools/benchmark/stat_summarizer.cc @@ -0,0 +1,318 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/common.h" +#include "mace/tools/benchmark/stat_summarizer.h" +#include "mace/proto/stats.pb.h" + +#include +#include + + +namespace mace { + +StatSummarizer::StatSummarizer(const StatSummarizerOptions& options) + : options_(options) {} + +StatSummarizer::~StatSummarizer() {} + +void StatSummarizer::Reset() { + run_total_us_.Reset(); + memory_.Reset(); + details_.clear(); +} + + +void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) { + int64_t curr_total_us = 0; + int64_t mem_total = 0; + + int64_t first_node_start_us = + run_metadata.op_stats(0).all_start_micros(); + + int node_num = 0; + for (const auto& ops : run_metadata.op_stats()) { + + std::string name = ops.operator_name(); + std::string op_type = ops.type(); + + ++node_num; + const int64_t curr_time = ops.all_end_rel_micros(); + curr_total_us += curr_time; + auto result = details_.emplace(name, Detail()); + Detail* detail = &(result.first->second); + + detail->start_us.UpdateStat(ops.all_start_micros() - first_node_start_us); + detail->rel_end_us.UpdateStat(curr_time); + + // If this is the first pass, initialize some values. + if (result.second) { + detail->name = name; + detail->type = op_type; + + detail->run_order = node_num; + + detail->times_called = 0; + } + + ++detail->times_called; + } + + run_total_us_.UpdateStat(curr_total_us); + memory_.UpdateStat(mem_total); +} + +std::string StatSummarizer::ShortSummary() const { + std::stringstream stream; + stream << "Timings (microseconds): "; + run_total_us_.OutputToStream(&stream); + stream << std::endl; + + stream << "Memory (bytes): "; + memory_.OutputToStream(&stream); + stream << std::endl; + + stream << details_.size() << " nodes observed" << std::endl; + return stream.str(); +} + +std::ostream& InitField(std::ostream& stream, int width) { + stream << "\t" << std::right << std::setw(width) << std::fixed + << std::setprecision(3); + return stream; +} + +std::string StatSummarizer::HeaderString(const std::string& title) const { + std::stringstream stream; + + stream << "============================== " << title + << " ==============================" << std::endl; + + InitField(stream, 24) << "[node type]"; + InitField(stream, 9) << "[start]"; + InitField(stream, 9) << "[first]"; + InitField(stream, 9) << "[avg ms]"; + InitField(stream, 8) << "[%]"; + InitField(stream, 8) << "[cdf%]"; + InitField(stream, 10) << "[mem KB]"; + InitField(stream, 9) << "[times called]"; + stream << "\t" + << "[Name]"; + return stream.str(); +} + +std::string StatSummarizer::ColumnString(const StatSummarizer::Detail& detail, + const int64_t cumulative_stat_on_node, + const Stat& stat) const { + const double start_ms = detail.start_us.avg() / 1000.0; + const double first_time_ms = detail.rel_end_us.first() / 1000.0; + const double avg_time_ms = detail.rel_end_us.avg() / 1000.0; + const double percentage = detail.rel_end_us.sum() * 100.0 / stat.sum(); + const double cdf_percentage = (cumulative_stat_on_node * 100.0f) / stat.sum(); + const int64_t times_called = detail.times_called / num_runs(); + + std::stringstream stream; + InitField(stream, 24) << detail.type; + InitField(stream, 9) << start_ms; + InitField(stream, 9) << first_time_ms; + InitField(stream, 9) << avg_time_ms; + InitField(stream, 7) << percentage << "%"; + InitField(stream, 7) << cdf_percentage << "%"; + InitField(stream, 10) << detail.mem_used.newest() / 1000.0; + InitField(stream, 9) << times_called; + stream << "\t" << detail.name; + + return stream.str(); +} + +void StatSummarizer::OrderNodesByMetric( + SortingMetric metric, std::vector* details) const { + std::priority_queue> sorted_list; + const int num_nodes = details_.size(); + + for (const auto& det : details_) { + const Detail* detail = &(det.second); + std::stringstream stream; + stream << std::setw(20) << std::right << std::setprecision(10) + << std::fixed; + + switch (metric) { + case BY_NAME: + stream << detail->name; + break; + case BY_RUN_ORDER: + stream << num_nodes - detail->run_order; + break; + case BY_TIME: + stream << detail->rel_end_us.avg(); + break; + case BY_MEMORY: + stream << detail->mem_used.avg(); + break; + case BY_TYPE: + stream << detail->type; + break; + default: + stream << ""; + break; + } + + sorted_list.emplace(stream.str(), detail); + } + + while (!sorted_list.empty()) { + auto entry = sorted_list.top(); + sorted_list.pop(); + details->push_back(entry.second); + } +} + +void StatSummarizer::ComputeStatsByType( + std::map* node_type_map_count, + std::map* node_type_map_time, + std::map* node_type_map_memory, + std::map* node_type_map_times_called, + int64_t* accumulated_us) const { + int64_t run_count = run_total_us_.count(); + + for (const auto& det : details_) { + const std::string node_name = det.first; + const Detail& detail = det.second; + + int64_t curr_time_val = + static_cast(detail.rel_end_us.sum() / run_count); + *accumulated_us += curr_time_val; + + int64_t curr_memory_val = detail.mem_used.newest(); + + const std::string& node_type = detail.type; + + (*node_type_map_count)[node_type] += 1; + (*node_type_map_time)[node_type] += curr_time_val; + (*node_type_map_memory)[node_type] += curr_memory_val; + (*node_type_map_times_called)[node_type] += detail.times_called / run_count; + } +} + +std::string StatSummarizer::GetStatsByNodeType() const { + std::stringstream stream; + + stream << "============================== Summary by node type " + "==============================" + << std::endl; + + LOG(INFO) << "Number of nodes executed: " << details_.size(); + + std::map node_type_map_count; + std::map node_type_map_time; + std::map node_type_map_memory; + std::map node_type_map_times_called; + int64_t accumulated_us = 0; + + ComputeStatsByType(&node_type_map_count, &node_type_map_time, + &node_type_map_memory, &node_type_map_times_called, + &accumulated_us); + + // Sort them. + std::priority_queue>> timings; + for (const auto& node_type : node_type_map_time) { + const int64_t mem_used = node_type_map_memory[node_type.first]; + timings.emplace(node_type.second, + std::pair(node_type.first, mem_used)); + } + + InitField(stream, 24) << "[Node type]"; + InitField(stream, 9) << "[count]"; + InitField(stream, 10) << "[avg ms]"; + InitField(stream, 11) << "[avg %]"; + InitField(stream, 11) << "[cdf %]"; + InitField(stream, 10) << "[mem KB]"; + InitField(stream, 10) << "[times called]"; + stream << std::endl; + + float cdf = 0.0f; + while (!timings.empty()) { + auto entry = timings.top(); + timings.pop(); + + const std::string node_type = entry.second.first; + const float memory = entry.second.second / 1000.0f; + + const int64_t node_type_total_us = entry.first; + const float time_per_run_ms = node_type_total_us / 1000.0f; + + const float percentage = + ((entry.first / static_cast(accumulated_us)) * 100.0f); + cdf += percentage; + + InitField(stream, 24) << node_type; + InitField(stream, 9) << node_type_map_count[node_type]; + InitField(stream, 10) << time_per_run_ms; + InitField(stream, 10) << percentage << "%"; + InitField(stream, 10) << cdf << "%"; + InitField(stream, 10) << memory; + InitField(stream, 9) << node_type_map_times_called[node_type]; + stream << std::endl; + } + stream << std::endl; + return stream.str(); +} + +std::string StatSummarizer::GetStatsByMetric(const std::string& title, + SortingMetric sorting_metric, + int num_stats) const { + std::vector details; + OrderNodesByMetric(sorting_metric, &details); + + double cumulative_stat_on_node = 0; + + std::stringstream stream; + stream << HeaderString(title) << std::endl; + int stat_num = 0; + for (auto detail : details) { + ++stat_num; + if (num_stats > 0 && stat_num > num_stats) { + break; + } + + cumulative_stat_on_node += detail->rel_end_us.sum(); + stream << ColumnString(*detail, cumulative_stat_on_node, run_total_us_) + << std::endl; + } + stream << std::endl; + return stream.str(); +} + +std::string StatSummarizer::GetOutputString() const { + std::stringstream stream; + if (options_.show_run_order) { + stream << GetStatsByMetric("Run Order", BY_RUN_ORDER, + options_.run_order_limit); + } + if (options_.show_time) { + stream << GetStatsByMetric("Top by Computation Time", BY_TIME, + options_.time_limit); + } + if (options_.show_memory) { + stream << GetStatsByMetric("Top by Memory Use", BY_MEMORY, + options_.memory_limit); + } + if (options_.show_type) { + stream << GetStatsByNodeType(); + } + if (options_.show_summary) { + stream << ShortSummary() << std::endl; + } + return stream.str(); +} + +void StatSummarizer::PrintOperatorStats() const { + std::string output = GetOutputString(); + std::istringstream iss(output); + for (std::string line; std::getline(iss, line);) { + LOG(INFO) << line; + } +} + +} // namespace mace diff --git a/mace/tools/benchmark/stat_summarizer.h b/mace/tools/benchmark/stat_summarizer.h new file mode 100644 index 00000000..c8604e24 --- /dev/null +++ b/mace/tools/benchmark/stat_summarizer.h @@ -0,0 +1,198 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_ +#define MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_ + +#include +#include +#include +#include +#include +#include + + +namespace mace { + +class RunMetadata; + +template +class Stat { + public: + void UpdateStat(ValueType v) { + if (count_ == 0) { + first_ = v; + } + + newest_ = v; + max_ = std::max(v, max_); + min_ = std::min(v, min_); + ++count_; + sum_ += v; + squared_sum_ += static_cast(v) * v; + } + + void Reset() { new (this) Stat(); } + + bool empty() const { return count_ == 0; } + + ValueType first() const { return first_; } + + ValueType newest() const { return newest_; } + + ValueType max() const { return max_; } + + ValueType min() const { return min_; } + + int64_t count() const { return count_; } + + ValueType sum() const { return sum_; } + + HighPrecisionValueType squared_sum() const { return squared_sum_; } + + bool all_same() const { return (count_ == 0 || min_ == max_); } + + HighPrecisionValueType avg() const { + return empty() ? std::numeric_limits::quiet_NaN() + : static_cast(sum_) / count_; + } + + ValueType std_deviation() const { + return all_same() ? 0 : std::sqrt(squared_sum_ / count_ - avg() * avg()); + } + + void OutputToStream(std::ostream* stream) const { + if (empty()) { + *stream << "count=0"; + } else if (all_same()) { + *stream << "count=" << count_ << " curr=" << newest_; + if (count_ > 1) *stream << "(all same)"; + } else { + *stream << "count=" << count_ << " first=" << first_ + << " curr=" << newest_ << " min=" << min_ << " max=" << max_ + << " avg=" << avg() << " std=" << std_deviation(); + } + } + + friend std::ostream& operator<<(std::ostream& stream, + const Stat& stat) { + stat.OutputToStream(&stream); + return stream; + } + + private: + ValueType first_ = 0; + ValueType newest_ = 0; + ValueType max_ = std::numeric_limits::min(); + ValueType min_ = std::numeric_limits::max(); + int64_t count_ = 0; + ValueType sum_ = 0; + HighPrecisionValueType squared_sum_ = 0; +}; + +// Used to control the output of the statistics summarizer; +class StatSummarizerOptions { + public: + StatSummarizerOptions() + : show_run_order(true), + run_order_limit(0), + show_time(true), + time_limit(10), + show_memory(true), + memory_limit(10), + show_type(true), + show_summary(true) {} + + bool show_run_order; + int run_order_limit; + bool show_time; + int time_limit; + bool show_memory; + int memory_limit; + bool show_type; + bool show_summary; +}; + +// A StatSummarizer assists in performance analysis of Graph executions. +// +// It summarizes time spent executing (on GPU/CPU), memory used etc. across +// multiple executions of a single Graph from the StepStats collected during +// graph execution. +// +// See tensorflow/tools/benchmark/benchmark_model.cc for an example usage. +class StatSummarizer { + public: + enum SortingMetric { + BY_NAME, + BY_RUN_ORDER, + BY_TIME, + BY_MEMORY, + BY_TYPE, + }; + + explicit StatSummarizer(const StatSummarizerOptions& options); + + ~StatSummarizer(); + + // Adds another run's StepStats output to the aggregate counts. + void ProcessMetadata(const RunMetadata& run_metadata); + + // Returns a string detailing the accumulated runtime stats in a tab-separated + // format which can be pasted into a spreadsheet for further analysis. + std::string GetOutputString() const; + + std::string ShortSummary() const; + + // Prints the string returned by GetOutputString(). + void PrintOperatorStats() const; + + void ComputeStatsByType(std::map* node_type_map_count, + std::map* node_type_map_time, + std::map* node_type_map_memory, + std::map* node_type_map_times_called, + int64_t* accumulated_us) const; + + std::string GetStatsByNodeType() const; + + std::string GetStatsByMetric(const std::string& title, + SortingMetric sorting_metric, + int num_stats) const; + + void Reset(); + + // Returns number of runs. + int num_runs() const { return run_total_us_.count(); } + + // Returns stats of total microseconds spent by all nodes in each run. + const Stat& run_total_us() const { return run_total_us_; } + + private: + struct Detail { + std::string name; + std::string type; + int64_t run_order; + Stat start_us; + Stat rel_end_us; + Stat mem_used; + int64_t times_called; + }; + + void OrderNodesByMetric(SortingMetric sorting_metric, + std::vector* details) const; + + std::string HeaderString(const std::string& title) const; + std::string ColumnString(const Detail& detail, + const int64_t cumulative_stat_on_node, + const Stat& stat) const; + + Stat run_total_us_; + Stat memory_; + + std::map details_; + StatSummarizerOptions options_; +}; + +} // namespace mace + +#endif // MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_ diff --git a/mace/utils/utils.h b/mace/utils/utils.h index d8e07e10..1c075632 100644 --- a/mace/utils/utils.h +++ b/mace/utils/utils.h @@ -4,6 +4,9 @@ #ifndef MACE_UTILS_UTILS_H_ #define MACE_UTILS_UTILS_H_ + +#include + namespace mace { template Integer RoundUp(Integer i, Integer factor) { @@ -14,5 +17,12 @@ template Integer CeilQuotient(Integer a, Integer b) { return (a + b - 1) / b; } + +inline int64_t NowInMicroSec() { + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec * 1000000 + tv.tv_usec); +} + } // namespace mace #endif // MACE_UTILS_UTILS_H_ -- GitLab