Commit 9d40ffd2 authored by liuqi

Add op run metadata statistics and benchmark framework.

Parent c0b8d045
@@ -46,6 +46,8 @@ cc_library(
     copts = ["-std=c++11"],
     deps = [
         "//mace/proto:cc_proto",
+        "//mace/proto:stats_proto",
+        "//mace/utils:utils",
     ],
     linkopts = if_android([
         "-llog",
...
@@ -3,6 +3,7 @@
 //
 #include "mace/core/net.h"
+#include "mace/utils/utils.h"

 namespace mace {
@@ -28,15 +29,27 @@ SimpleNet::SimpleNet(const std::shared_ptr<const NetDef>& net_def,
     }
   }
 }

-bool SimpleNet::Run() {
+bool SimpleNet::Run(RunMetadata* run_metadata) {
   VLOG(1) << "Running net " << name_;
   for (auto& op : operators_) {
     VLOG(1) << "Running operator " << op->debug_def().name() << "("
             << op->debug_def().type() << ").";
+    OperatorStats* op_stats = nullptr;
+    if (run_metadata) {
+      op_stats = run_metadata->add_op_stats();
+      op_stats->set_operator_name(op->debug_def().name());
+      op_stats->set_type(op->debug_def().type());
+      op_stats->set_all_start_micros(NowInMicroSec());
+      op_stats->set_op_start_rel_micros(NowInMicroSec() - op_stats->all_start_micros());
+    }
     if (!op->Run()) {
       LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
       return false;
     }
+    if (op_stats) {
+      op_stats->set_op_end_rel_micros(NowInMicroSec() - op_stats->all_start_micros());
+      op_stats->set_all_end_rel_micros(NowInMicroSec() - op_stats->all_start_micros());
+    }
     VLOG(1) << "Op " << op->debug_def().name()
             << " has shape: " << internal::MakeString(op->Output(0)->shape());
   }
...
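With this change a caller can opt into per-op profiling by passing a RunMetadata. A minimal sketch of consuming the collected stats (RunWithProfiling is a hypothetical helper, not part of this commit; it assumes a net constructed with CreateNet as elsewhere in the diff):

#include "mace/core/net.h"
#include "mace/proto/stats.pb.h"

namespace mace {

// Hypothetical helper: run once with profiling and log per-op wall time.
bool RunWithProfiling(NetBase* net) {
  RunMetadata run_metadata;
  if (!net->Run(&run_metadata)) return false;
  for (const OperatorStats& stats : run_metadata.op_stats()) {
    // The *_rel_micros fields are offsets from all_start_micros.
    LOG(INFO) << stats.operator_name() << " (" << stats.type() << "): "
              << stats.all_end_rel_micros() << " us";
  }
  return true;
}

}  // namespace mace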
@@ -9,6 +9,7 @@
 #include "mace/core/operator.h"
 #include "mace/core/workspace.h"
 #include "mace/proto/mace.pb.h"
+#include "mace/proto/stats.pb.h"

 namespace mace {
@@ -19,7 +20,7 @@ class NetBase {
               DeviceType type);
   virtual ~NetBase() noexcept {}

-  virtual bool Run() = 0;
+  virtual bool Run(RunMetadata* run_metadata = nullptr) = 0;

   const string& Name() const { return name_; }
@@ -35,7 +36,7 @@ class SimpleNet : public NetBase {
             Workspace* ws,
             DeviceType type);

-  bool Run() override;
+  bool Run(RunMetadata* run_metadata = nullptr) override;

  protected:
   vector<unique_ptr<OperatorBase> > operators_;
...
@@ -11,7 +11,6 @@ cc_binary(
     deps = [
         "//mace/core",
         "//mace/ops",
-        "@org_tensorflow//tensorflow/core:android_tensorflow_lib",
     ],
 )
...
@@ -101,6 +101,11 @@ int main(int argc, char **argv) {
   VLOG(0) << device_type;
   auto net = CreateNet(net_def, &ws, device_type);

+  // warm up
+  for (int i = 0; i < 2; ++i) {
+    net->Run();
+  }
+
   timeval tv1, tv2;
   gettimeofday(&tv1, NULL);
   for (int i = 0; i < round; ++i) {
...
@@ -20,6 +20,16 @@ cc_proto_library(
     deps = [":proto"],
 )

+proto_library(
+    name = "stats",
+    srcs = ["stats.proto"],
+)
+
+cc_proto_library(
+    name = "stats_proto",
+    deps = [":stats"],
+)
+
 py_proto_library(
     name = "mace_py",
     srcs = ["mace.proto"],
...
syntax = "proto2";

package mace;

message OperatorStats {
  optional string operator_name = 1;
  optional string type = 2;
  optional int64 all_start_micros = 3;
  optional int64 op_start_rel_micros = 4;
  optional int64 op_end_rel_micros = 5;
  optional int64 all_end_rel_micros = 6;
}

message RunMetadata {
  repeated OperatorStats op_stats = 1;
}
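The timing convention here: all_start_micros is an absolute timestamp taken just before the op runs, while the three *_rel_micros fields are offsets from it. Two illustrative helpers (hypothetical, assuming the generated stats.pb.h header), showing how durations fall out of the fields:

#include "mace/proto/stats.pb.h"

// Total wall time of the op: the end offset relative to all_start_micros.
inline int64_t OpWallTimeMicros(const mace::OperatorStats& s) {
  return s.all_end_rel_micros();
}

// Time spent inside op->Run() itself: end offset minus start offset.
inline int64_t OpComputeTimeMicros(const mace::OperatorStats& s) {
  return s.op_end_rel_micros() - s.op_start_rel_micros();
}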
# Benchmark
load("//mace:mace.bzl", "if_android")

licenses(["notice"])  # Apache 2.0

cc_library(
    name = "stat_summarizer",
    srcs = ["stat_summarizer.cc"],
    hdrs = ["stat_summarizer.h"],
    copts = ["-std=c++11"],
    linkopts = ["-fopenmp"] + if_android([
        "-ldl",
        "-lm",
    ]),
    linkstatic = 1,
    deps = [
        "//mace/core",
        "//mace/proto:stats_proto",
    ],
)

cc_binary(
    name = "benchmark_model",
    srcs = [
        "benchmark_model.cc",
    ],
    copts = ["-std=c++11"],
    linkopts = ["-fopenmp"] + if_android(["-ldl"]),
    linkstatic = 1,
    deps = [
        ":stat_summarizer",
        "//mace/core",
        "//mace/ops",
        "//mace/utils:command_line_flags",
    ],
)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/net.h"
#include "mace/tools/benchmark/stat_summarizer.h"
#include "mace/utils/command_line_flags.h"
#include "mace/utils/utils.h"

#include <chrono>   // for std::chrono::seconds
#include <cstdlib>  // for atoi, strtol, strtod
#include <fstream>
#include <thread>

namespace mace {
namespace str_util {

std::vector<std::string> Split(const string &str, char delims) {
  std::vector<std::string> result;
  string tmp = str;
  while (!tmp.empty()) {
    size_t next_offset = tmp.find(delims);
    // Push only the current token, not the whole remaining string.
    result.push_back(tmp.substr(0, next_offset));
    if (next_offset == string::npos) {
      break;
    } else {
      tmp = tmp.substr(next_offset + 1);
    }
  }
  return result;
}

bool SplitAndParseToInts(const string &str, char delims,
                         std::vector<index_t>* result) {
  string tmp = str;
  while (!tmp.empty()) {
    index_t dim = atoi(tmp.data());
    result->push_back(dim);
    size_t next_offset = tmp.find(delims);
    if (next_offset == string::npos) {
      break;
    } else {
      tmp = tmp.substr(next_offset + 1);
    }
  }
  return true;  // the function previously fell off the end without a return
}

}  // namespace str_util
namespace benchmark {

bool RunInference(NetBase* net, StatSummarizer* summarizer,
                  int64_t* inference_time_us) {
  RunMetadata run_metadata;
  RunMetadata* run_metadata_ptr = nullptr;
  if (summarizer) {
    run_metadata_ptr = &run_metadata;
  }

  const int64_t start_time = NowInMicroSec();
  bool s = net->Run(run_metadata_ptr);
  const int64_t end_time = NowInMicroSec();

  if (!s) {
    LOG(ERROR) << "Error during inference.";
    return s;
  }
  *inference_time_us = end_time - start_time;

  if (summarizer != nullptr) {
    summarizer->ProcessMetadata(run_metadata);
  }
  return true;
}
bool Run(NetBase* net, StatSummarizer* summarizer,
         int num_runs, double max_time_sec, int64_t sleep_sec,
         int64_t* total_time_us, int64_t* actual_num_runs) {
  *total_time_us = 0;

  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
            << max_time_sec << " seconds "
            << (summarizer != nullptr ? "with " : "without ")
            << "detailed stat logging, with " << sleep_sec
            << "s sleep between inferences";

  Stat<int64_t> stat;
  bool until_max_time = (num_runs <= 0);
  for (int i = 0; until_max_time || i < num_runs; ++i) {
    int64_t inference_time_us = 0;
    bool s = RunInference(net, summarizer, &inference_time_us);
    stat.UpdateStat(inference_time_us);
    (*total_time_us) += inference_time_us;
    ++(*actual_num_runs);

    if (max_time_sec > 0 && (*total_time_us / 1000000.0) > max_time_sec) {
      break;
    }

    if (!s) {
      LOG(INFO) << "Failed on run " << i;
      return s;
    }

    if (sleep_sec > 0) {
      std::this_thread::sleep_for(std::chrono::seconds(sleep_sec));
    }
  }

  std::stringstream stream;
  stat.OutputToStream(&stream);
  LOG(INFO) << stream.str();

  return true;
}
int Main(int argc, char** argv) {
  std::string model_file = "/data/local/tmp/mobi_mace.pb";
  std::string device = "CPU";
  std::string input_layer_string = "input:0";
  std::string input_layer_shape_string = "1,224,224,3";
  std::string input_layer_type_string = "float";
  std::string input_layer_files_string = "";
  std::string output_layer_string = "output:0";
  int max_num_runs = 10;
  std::string max_time = "10.0";
  std::string inference_delay = "-1";
  std::string inter_benchmark_delay = "-1";
  int num_threads = -1;
  std::string benchmark_name = "";
  std::string output_prefix = "";
  bool show_sizes = false;
  bool show_run_order = true;
  int run_order_limit = 0;
  bool show_time = true;
  int time_limit = 10;
  bool show_memory = true;
  int memory_limit = 10;
  bool show_type = true;
  bool show_summary = true;
  bool show_flops = false;
  int warmup_runs = 2;

  std::vector<Flag> flag_list = {
      Flag("model_file", &model_file, "graph file name"),
      Flag("device", &device, "CPU/NEON"),
      Flag("input_layer", &input_layer_string, "input layer names"),
      Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
      Flag("input_layer_type", &input_layer_type_string, "input layer type"),
      Flag("input_layer_files", &input_layer_files_string,
           "files to initialize the inputs with"),
      Flag("output_layer", &output_layer_string, "output layer name"),
      Flag("max_num_runs", &max_num_runs, "number of runs max"),
      Flag("max_time", &max_time, "length to run max"),
      Flag("inference_delay", &inference_delay, "delay between runs in seconds"),
      Flag("inter_benchmark_delay", &inter_benchmark_delay,
           "delay between benchmarks in seconds"),
      Flag("num_threads", &num_threads, "number of threads"),
      Flag("benchmark_name", &benchmark_name, "benchmark name"),
      Flag("output_prefix", &output_prefix, "benchmark output prefix"),
      Flag("show_sizes", &show_sizes, "whether to show sizes"),
      Flag("show_run_order", &show_run_order,
           "whether to list stats by run order"),
      Flag("run_order_limit", &run_order_limit,
           "how many items to show by run order"),
      Flag("show_time", &show_time, "whether to list stats by time taken"),
      Flag("time_limit", &time_limit, "how many items to show by time taken"),
      Flag("show_memory", &show_memory, "whether to list stats by memory used"),
      Flag("memory_limit", &memory_limit,
           "how many items to show by memory used"),
      Flag("show_type", &show_type, "whether to list stats by op type"),
      Flag("show_summary", &show_summary,
           "whether to show a summary of the stats"),
      Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
      Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"),
  };

  string usage = Flags::Usage(argv[0], flag_list);
  const bool parse_result = Flags::Parse(&argc, argv, flag_list);

  if (!parse_result) {
    LOG(ERROR) << usage;
    return -1;
  }
  std::vector<std::string> input_layers =
      str_util::Split(input_layer_string, ',');
  std::vector<std::string> input_layer_shapes =
      str_util::Split(input_layer_shape_string, ':');
  std::vector<string> input_layer_types =
      str_util::Split(input_layer_type_string, ',');
  std::vector<string> input_layer_files =
      str_util::Split(input_layer_files_string, ':');
  std::vector<string> output_layers = str_util::Split(output_layer_string, ',');

  if ((input_layers.size() != input_layer_shapes.size()) ||
      (input_layers.size() != input_layer_types.size())) {
    LOG(ERROR) << "There must be the same number of items in --input_layer,"
               << " --input_layer_shape, and --input_layer_type, for example"
               << " --input_layer=input1,input2 --input_layer_type=float,float "
               << " --input_layer_shape=1,224,224,4:1,20";
    LOG(ERROR) << "--input_layer=" << input_layer_string << " ("
               << input_layers.size() << " items)";
    LOG(ERROR) << "--input_layer_type=" << input_layer_type_string << " ("
               << input_layer_types.size() << " items)";
    LOG(ERROR) << "--input_layer_shape=" << input_layer_shape_string << " ("
               << input_layer_shapes.size() << " items)";
    return -1;
  }
  const size_t inputs_count = input_layers.size();

  if (argc > 1) {
    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
    return -1;
  }

  LOG(INFO) << "Model file: [" << model_file << "]";
  LOG(INFO) << "Device: [" << device << "]";
  LOG(INFO) << "Input layers: [" << input_layer_string << "]";
  LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
  LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
  LOG(INFO) << "Output layers: [" << output_layer_string << "]";
  LOG(INFO) << "Num runs: [" << max_num_runs << "]";
  LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
  LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
            << "]";
  LOG(INFO) << "Num threads: [" << num_threads << "]";
  LOG(INFO) << "Benchmark name: [" << benchmark_name << "]";
  LOG(INFO) << "Output prefix: [" << output_prefix << "]";
  LOG(INFO) << "Show sizes: [" << show_sizes << "]";
  LOG(INFO) << "Warmup runs: [" << warmup_runs << "]";

  const long int inter_inference_sleep_seconds =
      std::strtol(inference_delay.c_str(), nullptr, 10);
  const long int inter_benchmark_sleep_seconds =
      std::strtol(inter_benchmark_delay.c_str(), nullptr, 10);
  const double max_benchmark_time_seconds =
      std::strtod(max_time.c_str(), nullptr);

  std::unique_ptr<StatSummarizer> stats;

  StatSummarizerOptions stats_options;
  stats_options.show_run_order = show_run_order;
  stats_options.run_order_limit = run_order_limit;
  stats_options.show_time = show_time;
  stats_options.time_limit = time_limit;
  stats_options.show_memory = show_memory;
  stats_options.memory_limit = memory_limit;
  stats_options.show_type = show_type;
  stats_options.show_summary = show_summary;
  stats.reset(new StatSummarizer(stats_options));
  // load model
  std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary);
  if (!model_file_stream.is_open()) {
    LOG(ERROR) << "model file open failed";
    return -1;
  }
  NetDef net_def;
  net_def.ParseFromIstream(&model_file_stream);
  model_file_stream.close();

  Workspace ws;
  ws.LoadModelTensor(net_def, DeviceType::CPU);

  // Load inputs
  for (size_t i = 0; i < inputs_count; ++i) {
    Tensor *input_tensor =
        ws.CreateTensor(input_layers[i], cpu_allocator(), DT_FLOAT);
    vector<index_t> shapes;
    str_util::SplitAndParseToInts(input_layer_shapes[i], ',', &shapes);
    input_tensor->Resize(shapes);
    float *input_data = input_tensor->mutable_data<float>();

    // load input
    if (i < input_layer_files.size()) {
      std::ifstream in_file(input_layer_files[i],
                            std::ios::in | std::ios::binary);
      in_file.read(reinterpret_cast<char *>(input_data),
                   input_tensor->size() * sizeof(float));
      in_file.close();
    }
  }

  // create net
  DeviceType device_type;
  DeviceType_Parse(device, &device_type);
  auto net = CreateNet(net_def, &ws, device_type);

  int64_t warmup_time_us = 0;
  int64_t num_warmup_runs = 0;
  if (warmup_runs > 0) {
    bool status = Run(net.get(), nullptr,
                      warmup_runs, -1.0, inter_inference_sleep_seconds,
                      &warmup_time_us, &num_warmup_runs);
    if (!status) {
      LOG(ERROR) << "Failed at warm up run";
    }
  }

  if (inter_benchmark_sleep_seconds > 0) {
    std::this_thread::sleep_for(
        std::chrono::seconds(inter_benchmark_sleep_seconds));
  }
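  // Time inference without stat collection first, then repeat with stats
  // enabled, so the overhead of metadata collection can be compared.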
  int64_t no_stat_time_us = 0;
  int64_t no_stat_runs = 0;
  bool status = Run(net.get(), nullptr,
                    max_num_runs, max_benchmark_time_seconds,
                    inter_inference_sleep_seconds,
                    &no_stat_time_us, &no_stat_runs);
  if (!status) {
    LOG(ERROR) << "Failed at normal no-stat run";
  }

  int64_t stat_time_us = 0;
  int64_t stat_runs = 0;
  status = Run(net.get(), stats.get(),
               max_num_runs, max_benchmark_time_seconds,
               inter_inference_sleep_seconds,
               &stat_time_us, &stat_runs);
  if (!status) {
    LOG(ERROR) << "Failed at normal stat run";
  }

  LOG(INFO) << "Average inference timings in us: "
            << "Warmup: "
            << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
            << "no stats: " << no_stat_time_us / no_stat_runs << ", "
            << "with stats: " << stat_time_us / stat_runs;

  stats->PrintOperatorStats();

  return 0;
}

}  // namespace benchmark
}  // namespace mace
int main(int argc, char** argv) {
  return mace::benchmark::Main(argc, argv);
}
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/tools/benchmark/stat_summarizer.h"

#include "mace/core/common.h"
#include "mace/proto/stats.pb.h"

#include <iomanip>
#include <queue>

namespace mace {

StatSummarizer::StatSummarizer(const StatSummarizerOptions& options)
    : options_(options) {}

StatSummarizer::~StatSummarizer() {}
void StatSummarizer::Reset() {
  run_total_us_.Reset();
  memory_.Reset();
  details_.clear();
}

void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) {
  if (run_metadata.op_stats_size() == 0) {
    return;  // nothing was recorded for this run
  }

  int64_t curr_total_us = 0;
  int64_t mem_total = 0;

  int64_t first_node_start_us = run_metadata.op_stats(0).all_start_micros();

  int node_num = 0;
  for (const auto& ops : run_metadata.op_stats()) {
    std::string name = ops.operator_name();
    std::string op_type = ops.type();

    ++node_num;
    const int64_t curr_time = ops.all_end_rel_micros();
    curr_total_us += curr_time;

    auto result = details_.emplace(name, Detail());
    Detail* detail = &(result.first->second);

    detail->start_us.UpdateStat(ops.all_start_micros() - first_node_start_us);
    detail->rel_end_us.UpdateStat(curr_time);

    // If this is the first pass, initialize some values.
    if (result.second) {
      detail->name = name;
      detail->type = op_type;
      detail->run_order = node_num;
      detail->times_called = 0;
    }

    ++detail->times_called;
  }

  run_total_us_.UpdateStat(curr_total_us);
  memory_.UpdateStat(mem_total);
}
std::string StatSummarizer::ShortSummary() const {
  std::stringstream stream;
  stream << "Timings (microseconds): ";
  run_total_us_.OutputToStream(&stream);
  stream << std::endl;

  stream << "Memory (bytes): ";
  memory_.OutputToStream(&stream);
  stream << std::endl;

  stream << details_.size() << " nodes observed" << std::endl;
  return stream.str();
}

std::ostream& InitField(std::ostream& stream, int width) {
  stream << "\t" << std::right << std::setw(width) << std::fixed
         << std::setprecision(3);
  return stream;
}

std::string StatSummarizer::HeaderString(const std::string& title) const {
  std::stringstream stream;

  stream << "============================== " << title
         << " ==============================" << std::endl;

  InitField(stream, 24) << "[node type]";
  InitField(stream, 9) << "[start]";
  InitField(stream, 9) << "[first]";
  InitField(stream, 9) << "[avg ms]";
  InitField(stream, 8) << "[%]";
  InitField(stream, 8) << "[cdf%]";
  InitField(stream, 10) << "[mem KB]";
  InitField(stream, 9) << "[times called]";
  stream << "\t"
         << "[Name]";
  return stream.str();
}

std::string StatSummarizer::ColumnString(const StatSummarizer::Detail& detail,
                                         const int64_t cumulative_stat_on_node,
                                         const Stat<int64_t>& stat) const {
  const double start_ms = detail.start_us.avg() / 1000.0;
  const double first_time_ms = detail.rel_end_us.first() / 1000.0;
  const double avg_time_ms = detail.rel_end_us.avg() / 1000.0;
  const double percentage = detail.rel_end_us.sum() * 100.0 / stat.sum();
  const double cdf_percentage = (cumulative_stat_on_node * 100.0f) / stat.sum();
  const int64_t times_called = detail.times_called / num_runs();

  std::stringstream stream;
  InitField(stream, 24) << detail.type;
  InitField(stream, 9) << start_ms;
  InitField(stream, 9) << first_time_ms;
  InitField(stream, 9) << avg_time_ms;
  InitField(stream, 7) << percentage << "%";
  InitField(stream, 7) << cdf_percentage << "%";
  InitField(stream, 10) << detail.mem_used.newest() / 1000.0;
  InitField(stream, 9) << times_called;
  stream << "\t" << detail.name;

  return stream.str();
}

void StatSummarizer::OrderNodesByMetric(
    SortingMetric metric, std::vector<const Detail*>* details) const {
  std::priority_queue<std::pair<std::string, const Detail*>> sorted_list;
  const int num_nodes = details_.size();

  for (const auto& det : details_) {
    const Detail* detail = &(det.second);
    std::stringstream stream;
    stream << std::setw(20) << std::right << std::setprecision(10)
           << std::fixed;

    switch (metric) {
      case BY_NAME:
        stream << detail->name;
        break;
      case BY_RUN_ORDER:
        stream << num_nodes - detail->run_order;
        break;
      case BY_TIME:
        stream << detail->rel_end_us.avg();
        break;
      case BY_MEMORY:
        stream << detail->mem_used.avg();
        break;
      case BY_TYPE:
        stream << detail->type;
        break;
      default:
        stream << "";
        break;
    }

    sorted_list.emplace(stream.str(), detail);
  }

  while (!sorted_list.empty()) {
    auto entry = sorted_list.top();
    sorted_list.pop();
    details->push_back(entry.second);
  }
}

void StatSummarizer::ComputeStatsByType(
    std::map<std::string, int64_t>* node_type_map_count,
    std::map<std::string, int64_t>* node_type_map_time,
    std::map<std::string, int64_t>* node_type_map_memory,
    std::map<std::string, int64_t>* node_type_map_times_called,
    int64_t* accumulated_us) const {
  int64_t run_count = run_total_us_.count();

  for (const auto& det : details_) {
    const std::string node_name = det.first;
    const Detail& detail = det.second;

    int64_t curr_time_val =
        static_cast<int64_t>(detail.rel_end_us.sum() / run_count);
    *accumulated_us += curr_time_val;

    int64_t curr_memory_val = detail.mem_used.newest();

    const std::string& node_type = detail.type;

    (*node_type_map_count)[node_type] += 1;
    (*node_type_map_time)[node_type] += curr_time_val;
    (*node_type_map_memory)[node_type] += curr_memory_val;
    (*node_type_map_times_called)[node_type] += detail.times_called / run_count;
  }
}

std::string StatSummarizer::GetStatsByNodeType() const {
  std::stringstream stream;

  stream << "============================== Summary by node type "
            "=============================="
         << std::endl;

  LOG(INFO) << "Number of nodes executed: " << details_.size();

  std::map<std::string, int64_t> node_type_map_count;
  std::map<std::string, int64_t> node_type_map_time;
  std::map<std::string, int64_t> node_type_map_memory;
  std::map<std::string, int64_t> node_type_map_times_called;
  int64_t accumulated_us = 0;

  ComputeStatsByType(&node_type_map_count, &node_type_map_time,
                     &node_type_map_memory, &node_type_map_times_called,
                     &accumulated_us);

  // Sort them.
  std::priority_queue<std::pair<int64_t, std::pair<std::string, int64_t>>>
      timings;
  for (const auto& node_type : node_type_map_time) {
    const int64_t mem_used = node_type_map_memory[node_type.first];
    timings.emplace(node_type.second,
                    std::pair<std::string, int64_t>(node_type.first, mem_used));
  }

  InitField(stream, 24) << "[Node type]";
  InitField(stream, 9) << "[count]";
  InitField(stream, 10) << "[avg ms]";
  InitField(stream, 11) << "[avg %]";
  InitField(stream, 11) << "[cdf %]";
  InitField(stream, 10) << "[mem KB]";
  InitField(stream, 10) << "[times called]";
  stream << std::endl;

  float cdf = 0.0f;
  while (!timings.empty()) {
    auto entry = timings.top();
    timings.pop();

    const std::string node_type = entry.second.first;
    const float memory = entry.second.second / 1000.0f;

    const int64_t node_type_total_us = entry.first;
    const float time_per_run_ms = node_type_total_us / 1000.0f;

    const float percentage =
        ((entry.first / static_cast<float>(accumulated_us)) * 100.0f);
    cdf += percentage;

    InitField(stream, 24) << node_type;
    InitField(stream, 9) << node_type_map_count[node_type];
    InitField(stream, 10) << time_per_run_ms;
    InitField(stream, 10) << percentage << "%";
    InitField(stream, 10) << cdf << "%";
    InitField(stream, 10) << memory;
    InitField(stream, 9) << node_type_map_times_called[node_type];
    stream << std::endl;
  }
  stream << std::endl;
  return stream.str();
}

std::string StatSummarizer::GetStatsByMetric(const std::string& title,
                                             SortingMetric sorting_metric,
                                             int num_stats) const {
  std::vector<const Detail*> details;
  OrderNodesByMetric(sorting_metric, &details);

  double cumulative_stat_on_node = 0;

  std::stringstream stream;
  stream << HeaderString(title) << std::endl;
  int stat_num = 0;
  for (auto detail : details) {
    ++stat_num;
    if (num_stats > 0 && stat_num > num_stats) {
      break;
    }

    cumulative_stat_on_node += detail->rel_end_us.sum();
    stream << ColumnString(*detail, cumulative_stat_on_node, run_total_us_)
           << std::endl;
  }
  stream << std::endl;
  return stream.str();
}

std::string StatSummarizer::GetOutputString() const {
  std::stringstream stream;
  if (options_.show_run_order) {
    stream << GetStatsByMetric("Run Order", BY_RUN_ORDER,
                               options_.run_order_limit);
  }
  if (options_.show_time) {
    stream << GetStatsByMetric("Top by Computation Time", BY_TIME,
                               options_.time_limit);
  }
  if (options_.show_memory) {
    stream << GetStatsByMetric("Top by Memory Use", BY_MEMORY,
                               options_.memory_limit);
  }
  if (options_.show_type) {
    stream << GetStatsByNodeType();
  }
  if (options_.show_summary) {
    stream << ShortSummary() << std::endl;
  }
  return stream.str();
}

void StatSummarizer::PrintOperatorStats() const {
  std::string output = GetOutputString();
  std::istringstream iss(output);
  for (std::string line; std::getline(iss, line);) {
    LOG(INFO) << line;
  }
}

}  // namespace mace
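Putting the pieces together, the typical lifecycle of a StatSummarizer mirrors the benchmark above: construct it with options, feed it one RunMetadata per profiled run, then print. A sketch (ProfileNet is a hypothetical helper, not part of this commit):

#include "mace/core/net.h"
#include "mace/tools/benchmark/stat_summarizer.h"

// Hypothetical helper: aggregate per-op stats over several runs.
void ProfileNet(mace::NetBase* net, int num_runs) {
  mace::StatSummarizerOptions options;  // defaults show all tables
  mace::StatSummarizer summarizer(options);
  for (int i = 0; i < num_runs; ++i) {
    mace::RunMetadata run_metadata;
    if (!net->Run(&run_metadata)) break;
    summarizer.ProcessMetadata(run_metadata);  // accumulate per-op stats
  }
  summarizer.PrintOperatorStats();  // logs the tables line by line
}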
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
#define MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_

#include <algorithm>  // for std::max, std::min
#include <cmath>
#include <cstdint>    // for int64_t
#include <cstdlib>
#include <limits>
#include <map>
#include <sstream>
#include <string>
#include <vector>

namespace mace {

class RunMetadata;
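// Accumulates running statistics (first/newest/min/max/avg/std deviation)
// over a stream of values of type ValueType.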
template <typename ValueType, typename HighPrecisionValueType = double>
class Stat {
 public:
  void UpdateStat(ValueType v) {
    if (count_ == 0) {
      first_ = v;
    }

    newest_ = v;
    max_ = std::max(v, max_);
    min_ = std::min(v, min_);
    ++count_;
    sum_ += v;
    squared_sum_ += static_cast<HighPrecisionValueType>(v) * v;
  }

  void Reset() { new (this) Stat<ValueType, HighPrecisionValueType>(); }

  bool empty() const { return count_ == 0; }

  ValueType first() const { return first_; }

  ValueType newest() const { return newest_; }

  ValueType max() const { return max_; }

  ValueType min() const { return min_; }

  int64_t count() const { return count_; }

  ValueType sum() const { return sum_; }

  HighPrecisionValueType squared_sum() const { return squared_sum_; }

  bool all_same() const { return (count_ == 0 || min_ == max_); }

  HighPrecisionValueType avg() const {
    return empty() ? std::numeric_limits<ValueType>::quiet_NaN()
                   : static_cast<HighPrecisionValueType>(sum_) / count_;
  }

  ValueType std_deviation() const {
    return all_same() ? 0 : std::sqrt(squared_sum_ / count_ - avg() * avg());
  }

  void OutputToStream(std::ostream* stream) const {
    if (empty()) {
      *stream << "count=0";
    } else if (all_same()) {
      *stream << "count=" << count_ << " curr=" << newest_;
      if (count_ > 1) *stream << "(all same)";
    } else {
      *stream << "count=" << count_ << " first=" << first_
              << " curr=" << newest_ << " min=" << min_ << " max=" << max_
              << " avg=" << avg() << " std=" << std_deviation();
    }
  }

  friend std::ostream& operator<<(std::ostream& stream,
                                  const Stat<ValueType>& stat) {
    stat.OutputToStream(&stream);
    return stream;
  }

 private:
  ValueType first_ = 0;
  ValueType newest_ = 0;
  ValueType max_ = std::numeric_limits<ValueType>::min();
  ValueType min_ = std::numeric_limits<ValueType>::max();
  int64_t count_ = 0;
  ValueType sum_ = 0;
  HighPrecisionValueType squared_sum_ = 0;
};
// Used to control the output of the statistics summarizer.
class StatSummarizerOptions {
 public:
  StatSummarizerOptions()
      : show_run_order(true),
        run_order_limit(0),
        show_time(true),
        time_limit(10),
        show_memory(true),
        memory_limit(10),
        show_type(true),
        show_summary(true) {}

  bool show_run_order;
  int run_order_limit;
  bool show_time;
  int time_limit;
  bool show_memory;
  int memory_limit;
  bool show_type;
  bool show_summary;
};
// A StatSummarizer assists in performance analysis of Graph executions.
//
// It summarizes time spent executing (on GPU/CPU), memory used, etc. across
// multiple executions of a single Graph from the RunMetadata collected
// during net execution.
//
// See mace/tools/benchmark/benchmark_model.cc for an example usage.
class StatSummarizer {
 public:
  enum SortingMetric {
    BY_NAME,
    BY_RUN_ORDER,
    BY_TIME,
    BY_MEMORY,
    BY_TYPE,
  };

  explicit StatSummarizer(const StatSummarizerOptions& options);
  ~StatSummarizer();

  // Adds another run's RunMetadata output to the aggregate counts.
  void ProcessMetadata(const RunMetadata& run_metadata);

  // Returns a string detailing the accumulated runtime stats in a
  // tab-separated format which can be pasted into a spreadsheet for further
  // analysis.
  std::string GetOutputString() const;

  std::string ShortSummary() const;

  // Prints the string returned by GetOutputString().
  void PrintOperatorStats() const;

  void ComputeStatsByType(
      std::map<std::string, int64_t>* node_type_map_count,
      std::map<std::string, int64_t>* node_type_map_time,
      std::map<std::string, int64_t>* node_type_map_memory,
      std::map<std::string, int64_t>* node_type_map_times_called,
      int64_t* accumulated_us) const;

  std::string GetStatsByNodeType() const;

  std::string GetStatsByMetric(const std::string& title,
                               SortingMetric sorting_metric,
                               int num_stats) const;

  void Reset();

  // Returns number of runs.
  int num_runs() const { return run_total_us_.count(); }

  // Returns stats of total microseconds spent by all nodes in each run.
  const Stat<int64_t>& run_total_us() const { return run_total_us_; }

 private:
  struct Detail {
    std::string name;
    std::string type;
    int64_t run_order;
    Stat<int64_t> start_us;
    Stat<int64_t> rel_end_us;
    Stat<int64_t> mem_used;
    int64_t times_called;
  };

  void OrderNodesByMetric(SortingMetric sorting_metric,
                          std::vector<const Detail*>* details) const;

  std::string HeaderString(const std::string& title) const;
  std::string ColumnString(const Detail& detail,
                           const int64_t cumulative_stat_on_node,
                           const Stat<int64_t>& stat) const;

  Stat<int64_t> run_total_us_;
  Stat<int64_t> memory_;

  std::map<std::string, Detail> details_;
  StatSummarizerOptions options_;
};

}  // namespace mace

#endif  // MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
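For reference, the Stat<> accumulator can be exercised on its own; a minimal sketch with made-up latency values:

#include <cstdint>
#include <iostream>

#include "mace/tools/benchmark/stat_summarizer.h"

int main() {
  mace::Stat<int64_t> latency_us;
  const int64_t vals[] = {1200, 980, 1100};  // made-up latencies, microseconds
  for (int64_t v : vals) {
    latency_us.UpdateStat(v);
  }
  // Prints something like: count=3 first=1200 curr=1100 min=980 max=1200 ...
  std::cout << latency_us << std::endl;
  return 0;
}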
@@ -4,6 +4,9 @@
 #ifndef MACE_UTILS_UTILS_H_
 #define MACE_UTILS_UTILS_H_
+
+#include <sys/time.h>
+
 namespace mace {
 template <typename Integer>
 Integer RoundUp(Integer i, Integer factor) {
@@ -14,5 +17,12 @@ template <typename Integer>
 Integer CeilQuotient(Integer a, Integer b) {
   return (a + b - 1) / b;
 }
+
+inline int64_t NowInMicroSec() {
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  // Cast before multiplying to avoid 32-bit overflow where time_t is 32-bit.
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+}
 } // namespace mace
 #endif // MACE_UTILS_UTILS_H_
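One caveat worth noting: gettimeofday returns wall-clock time, which can jump under NTP adjustment. A monotonic variant (a sketch, not part of this commit) could use std::chrono::steady_clock instead:

#include <chrono>
#include <cstdint>

// Sketch: monotonic alternative to NowInMicroSec for benchmarking.
inline int64_t NowInMicroSecSteady() {
  return std::chrono::duration_cast<std::chrono::microseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
      .count();
}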