diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..41d9067871bf59b2f2bc3590147b2868226b1944
--- /dev/null
+++ b/mace/benchmark/BUILD
@@ -0,0 +1,66 @@
+# Benchmark
+# Examples
+load(
+    "//:mace.bzl",
+    "if_production_mode",
+    "if_not_production_mode",
+    "if_hexagon_enabled",
+    "if_openmp_enabled",
+)
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "stat_summarizer",
+    srcs = ["stat_summarizer.cc"],
+    hdrs = ["stat_summarizer.h"],
+    linkstatic = 1,
+    deps = [
+        "@mace//:mace_headers",
+    ],
+)
+
+cc_binary(
+    name = "benchmark_model",
+    srcs = [
+        "benchmark_model.cc",
+    ],
+    linkopts = if_openmp_enabled(["-fopenmp"]),
+    linkstatic = 1,
+    deps = [
+        ":stat_summarizer",
+        "//codegen:generated_models",
+        "//external:gflags_nothreads",
+    ] + if_hexagon_enabled([
+        "//lib/hexagon:hexagon",
+    ]) + if_production_mode([
+        "@mace//:mace_prod",
+        "//codegen:generated_opencl_prod",
+        "//codegen:generated_tuning_params",
+    ]) + if_not_production_mode([
+        "@mace//:mace_dev",
+    ]),
+)
+
+cc_library(
+    name = "libmace_merged",
+    srcs = [
+        "libmace_merged.a",
+    ],
+    visibility = ["//visibility:private"],
+)
+
+cc_binary(
+    name = "model_throughput_test",
+    srcs = ["model_throughput_test.cc"],
+    linkopts = if_openmp_enabled(["-fopenmp"]),
+    linkstatic = 1,
+    deps = [
+        ":libmace_merged",
+        "//external:gflags_nothreads",
+        "//lib/hexagon",
+        "@mace//:mace",
+        "@mace//:mace_headers",
+        "@mace//:mace_prod",
+    ],
+)
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26bfeb618bdeff76c56e9ceb2cb33930a43dc461
--- /dev/null
+++ b/mace/benchmark/benchmark_model.cc
@@ -0,0 +1,362 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
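+//
+// Example invocation (a sketch; the flag values below are illustrative,
+// not defaults):
+//   benchmark_model --device=OPENCL \
+//     --input_node=input --input_shape=1,224,224,3 \
+//     --output_node=output --output_shape=1,1,1,1001 \
+//     --input_file=input_data --max_num_runs=100 --max_time=10.0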
+//
+
+#include "gflags/gflags.h"
+#include "mace/public/mace.h"
+#include "mace/utils/logging.h"
+#include "benchmark/stat_summarizer.h"
+
+#include <sys/time.h>
+#include <cctype>
+#include <chrono>
+#include <cstdlib>
+#include <fstream>
+#include <memory>
+#include <numeric>
+#include <sstream>
+#include <thread>
+
+namespace mace {
+namespace MACE_MODEL_TAG {
+
+extern const unsigned char *LoadModelData(const char *model_data_file);
+
+extern void UnloadModelData(const unsigned char *model_data);
+
+extern NetDef CreateNet(const unsigned char *model_data);
+
+extern const std::string ModelChecksum();
+
+}  // namespace MACE_MODEL_TAG
+}  // namespace mace
+
+namespace mace {
+namespace str_util {
+
+std::vector<std::string> Split(const std::string &str, char delims) {
+  std::vector<std::string> result;
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    size_t next_offset = tmp.find(delims);
+    result.push_back(tmp.substr(0, next_offset));
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+  return result;
+}
+
+bool SplitAndParseToInts(const std::string &str,
+                         char delims,
+                         std::vector<int64_t> *result) {
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    int64_t dim = atoi(tmp.data());
+    result->push_back(dim);
+    size_t next_offset = tmp.find(delims);
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+  return true;
+}
+
+}  // namespace str_util
+
+namespace benchmark {
+
+void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    int dim = atoi(tmp.data());
+    shape->push_back(dim);
+    size_t next_offset = tmp.find(",");
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+}
+
+std::string FormatName(const std::string &input) {
+  std::string res = input;
+  for (size_t i = 0; i < input.size(); ++i) {
+    if (!::isalnum(res[i])) res[i] = '_';
+  }
+  return res;
+}
+
+inline int64_t NowMicros() {
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+}
+
+bool RunInference(MaceEngine *engine,
+                  const std::vector<MaceInputInfo> &input_infos,
+                  std::map<std::string, float *> &output_infos,
+                  StatSummarizer *summarizer,
+                  int64_t *inference_time_us) {
+  RunMetadata run_metadata;
+  RunMetadata *run_metadata_ptr = nullptr;
+  if (summarizer) {
+    run_metadata_ptr = &run_metadata;
+  }
+  if (input_infos.size() == 1 && output_infos.size() == 1) {
+    const int64_t start_time = NowMicros();
+    bool s = engine->Run(input_infos[0].data, input_infos[0].shape,
+                         output_infos.begin()->second, run_metadata_ptr);
+    const int64_t end_time = NowMicros();
+
+    if (!s) {
+      LOG(ERROR) << "Error during inference.";
+      return s;
+    }
+    *inference_time_us = end_time - start_time;
+  } else {
+    const int64_t start_time = NowMicros();
+    bool s = engine->Run(input_infos, output_infos, run_metadata_ptr);
+    const int64_t end_time = NowMicros();
+
+    if (!s) {
+      LOG(ERROR) << "Error during inference.";
+      return s;
+    }
+    *inference_time_us = end_time - start_time;
+  }
+
+  if (summarizer != nullptr) {
+    summarizer->ProcessMetadata(run_metadata);
+  }
+
+  return true;
+}
+
+bool Run(MaceEngine *engine,
+         const std::vector<MaceInputInfo> &input_infos,
+         std::map<std::string, float *> &output_infos,
+         StatSummarizer *summarizer,
+         int num_runs,
+         double max_time_sec,
+         int64_t sleep_sec,
+         int64_t *total_time_us,
+         int64_t *actual_num_runs) {
+  *total_time_us = 0;
+
+  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
+            << max_time_sec << " seconds "
"with " : "without ") + << "detailed stat logging, with " << sleep_sec + << "s sleep between inferences"; + + Stat stat; + + bool util_max_time = (num_runs <= 0); + for (int i = 0; util_max_time || i < num_runs; ++i) { + int64_t inference_time_us = 0; + bool s = RunInference(engine, input_infos, output_infos, summarizer, &inference_time_us); + stat.UpdateStat(inference_time_us); + (*total_time_us) += inference_time_us; + ++(*actual_num_runs); + + if (max_time_sec > 0 && (*total_time_us / 1000000.0) > max_time_sec) { + break; + } + + if (!s) { + LOG(INFO) << "Failed on run " << i; + return s; + } + + if (sleep_sec > 0) { + std::this_thread::sleep_for(std::chrono::seconds(sleep_sec)); + } + } + + std::stringstream stream; + stat.OutputToStream(&stream); + LOG(INFO) << stream.str(); + + return true; +} + +DEFINE_string(device, "CPU", "Device [CPU|OPENCL]"); +DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma"); +DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma"); +DEFINE_string(input_shape, "", "input shape, separated by colon and comma"); +DEFINE_string(output_shape, "", "output shape, separated by colon and comma"); +DEFINE_string(input_file, "", "input file name"); +DEFINE_int32(max_num_runs, 100, "number of runs max"); +DEFINE_string(max_time, "10.0", "length to run max"); +DEFINE_string(inference_delay, "-1", "delay between runs in seconds"); +DEFINE_string(inter_benchmark_delay, "-1", "delay between benchmarks in seconds"); +DEFINE_string(benchmark_name, "", "benchmark name"); +DEFINE_bool(show_run_order, true, "whether to list stats by run order"); +DEFINE_int32(run_order_limit, 0, "how many items to show by run order"); +DEFINE_bool(show_time, true, "whether to list stats by time taken"); +DEFINE_int32(time_limit, 10, "how many items to show by time taken"); +DEFINE_bool(show_memory, false, "whether to list stats by memory used"); +DEFINE_int32(memory_limit, 10, "how many items to show by memory used"); +DEFINE_bool(show_type, true, "whether to list stats by op type"); +DEFINE_bool(show_summary, true, "whether to show a summary of the stats"); +DEFINE_bool(show_flops, true, "whether to estimate the model's FLOPs"); +DEFINE_int32(warmup_runs, 1, "how many runs to initialize model"); +DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0"); + +int Main(int argc, char **argv) { + MACE_CHECK(FLAGS_device != "HEXAGON", "Model benchmark tool do not support DSP."); + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "Benchmark name: [" << FLAGS_benchmark_name << "]"; + LOG(INFO) << "Device: [" << FLAGS_device << "]"; + LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]"; + LOG(INFO) << "Input shapes: [" << FLAGS_input_shape << "]"; + LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]"; + LOG(INFO) << "output shapes: [" << FLAGS_output_shape << "]"; + LOG(INFO) << "Warmup runs: [" << FLAGS_warmup_runs << "]"; + LOG(INFO) << "Num runs: [" << FLAGS_max_num_runs << "]"; + LOG(INFO) << "Inter-inference delay (seconds): [" << FLAGS_inference_delay << "]"; + LOG(INFO) << "Inter-benchmark delay (seconds): [" << FLAGS_inter_benchmark_delay << "]"; + + const long int inter_inference_sleep_seconds = + std::strtol(FLAGS_inference_delay.c_str(), nullptr, 10); + const long int inter_benchmark_sleep_seconds = + std::strtol(FLAGS_inter_benchmark_delay.c_str(), nullptr, 10); + const double max_benchmark_time_seconds = + 
+      std::strtod(FLAGS_max_time.c_str(), nullptr);
+
+  std::unique_ptr<StatSummarizer> stats;
+
+  StatSummarizerOptions stats_options;
+  stats_options.show_run_order = FLAGS_show_run_order;
+  stats_options.run_order_limit = FLAGS_run_order_limit;
+  stats_options.show_time = FLAGS_show_time;
+  stats_options.time_limit = FLAGS_time_limit;
+  stats_options.show_memory = FLAGS_show_memory;
+  stats_options.memory_limit = FLAGS_memory_limit;
+  stats_options.show_type = FLAGS_show_type;
+  stats_options.show_summary = FLAGS_show_summary;
+  stats.reset(new StatSummarizer(stats_options));
+
+  DeviceType device_type = CPU;
+  if (FLAGS_device == "OPENCL") {
+    device_type = OPENCL;
+  }
+
+  std::vector<std::string> input_names =
+      str_util::Split(FLAGS_input_node, ',');
+  std::vector<std::string> output_names =
+      str_util::Split(FLAGS_output_node, ',');
+  std::vector<std::string> input_shapes =
+      str_util::Split(FLAGS_input_shape, ':');
+  std::vector<std::string> output_shapes =
+      str_util::Split(FLAGS_output_shape, ':');
+
+  const size_t input_count = input_shapes.size();
+  const size_t output_count = output_shapes.size();
+  std::vector<std::vector<int64_t>> input_shape_vec(input_count);
+  std::vector<std::vector<int64_t>> output_shape_vec(output_count);
+  for (size_t i = 0; i < input_count; ++i) {
+    ParseShape(input_shapes[i], &input_shape_vec[i]);
+  }
+  for (size_t i = 0; i < output_count; ++i) {
+    ParseShape(output_shapes[i], &output_shape_vec[i]);
+  }
+
+  const unsigned char *model_data =
+      mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
+  NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
+
+  std::vector<MaceInputInfo> input_infos(input_count);
+  std::map<std::string, float *> output_infos;
+  std::vector<std::unique_ptr<float[]>> input_datas(input_count);
+  std::vector<std::unique_ptr<float[]>> output_datas(output_count);
+
+  for (size_t i = 0; i < input_count; ++i) {
+    int64_t input_size =
+        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(),
+                        1, std::multiplies<int64_t>());
+    input_datas[i].reset(new float[input_size]);
+    // Load input.
+    std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
+                          std::ios::in | std::ios::binary);
+    if (in_file.is_open()) {
+      in_file.read(reinterpret_cast<char *>(input_datas[i].get()),
+                   input_size * sizeof(float));
+      in_file.close();
+    } else {
+      LOG(ERROR) << "Failed to open input file: "
+                 << FLAGS_input_file + "_" + FormatName(input_names[i]);
+      return -1;
+    }
+
+    input_infos[i].name = input_names[i];
+    input_infos[i].shape = input_shape_vec[i];
+    input_infos[i].data = input_datas[i].get();
+  }
+  for (size_t i = 0; i < output_count; ++i) {
+    int64_t output_size =
+        std::accumulate(output_shape_vec[i].begin(), output_shape_vec[i].end(),
+                        1, std::multiplies<int64_t>());
+    output_datas[i].reset(new float[output_size]);
+    output_infos[output_names[i]] = output_datas[i].get();
+  }
+
+  // Init model
+  LOG(INFO) << "Run init";
+  std::unique_ptr<mace::MaceEngine> engine_ptr;
+  if (input_count == 1 && output_count == 1) {
+    engine_ptr.reset(new mace::MaceEngine(&net_def, device_type));
+  } else {
+    engine_ptr.reset(new mace::MaceEngine(&net_def, device_type,
+                                          input_names, output_names));
+  }
+  if (device_type == DeviceType::OPENCL) {
+    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
+  }
+
+  LOG(INFO) << "Warm up";
+
+  int64_t warmup_time_us = 0;
+  int64_t num_warmup_runs = 0;
+  if (FLAGS_warmup_runs > 0) {
+    bool status =
+        Run(engine_ptr.get(), input_infos, output_infos, nullptr,
+            FLAGS_warmup_runs, -1.0, inter_inference_sleep_seconds,
+            &warmup_time_us, &num_warmup_runs);
+    if (!status) {
+      LOG(ERROR) << "Failed at warm up run";
+    }
+  }
+
+  if (inter_benchmark_sleep_seconds > 0) {
+    std::this_thread::sleep_for(
+        std::chrono::seconds(inter_benchmark_sleep_seconds));
+  }
+  int64_t no_stat_time_us = 0;
+  int64_t no_stat_runs = 0;
+  bool status =
+      Run(engine_ptr.get(), input_infos, output_infos,
+          nullptr, FLAGS_max_num_runs, max_benchmark_time_seconds,
+          inter_inference_sleep_seconds, &no_stat_time_us, &no_stat_runs);
+  if (!status) {
+    LOG(ERROR) << "Failed at normal no-stat run";
+  }
+
+  int64_t stat_time_us = 0;
+  int64_t stat_runs = 0;
+  status = Run(engine_ptr.get(), input_infos, output_infos,
+               stats.get(), FLAGS_max_num_runs, max_benchmark_time_seconds,
+               inter_inference_sleep_seconds, &stat_time_us, &stat_runs);
+  if (!status) {
+    LOG(ERROR) << "Failed at normal stat run";
+  }
+
+  LOG(INFO) << "Average inference timings in us: "
+            << "Warmup: "
+            << (FLAGS_warmup_runs > 0 ? warmup_time_us / FLAGS_warmup_runs : 0)
+            << ", "
+            << "no stats: " << no_stat_time_us / no_stat_runs << ", "
+            << "with stats: " << stat_time_us / stat_runs;
+
+  stats->PrintOperatorStats();
+
+  return 0;
+}
+
+}  // namespace benchmark
+}  // namespace mace
+
+int main(int argc, char **argv) { return mace::benchmark::Main(argc, argv); }
diff --git a/mace/benchmark/model_throughput_test.cc b/mace/benchmark/model_throughput_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48e622cb7b79b026b89ad6eb1fb4349056b2d9e8
--- /dev/null
+++ b/mace/benchmark/model_throughput_test.cc
@@ -0,0 +1,281 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+/**
+ * Usage:
+ * throughput_test \
+ *     --input_shape=1,224,224,3 \
+ *     --output_shape=1,224,224,2 \
+ *     --input_file=input_data \
+ *     --cpu_model_data_file=cpu_model_data.data \
+ *     --gpu_model_data_file=gpu_model_data.data \
+ *     --dsp_model_data_file=dsp_model_data.data \
+ *     --run_seconds=10
+ */
+#include <cstdint>
+#include <cstdlib>
+#include <fstream>
+#include <numeric>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "mace/public/mace.h"
+#include "mace/utils/env_time.h"
+#include "mace/utils/logging.h"
+
+using namespace std;
+using namespace mace;
+
+namespace mace {
+
+#ifdef MACE_CPU_MODEL_TAG
+namespace MACE_CPU_MODEL_TAG {
+
+extern const unsigned char *LoadModelData(const char *model_data_file);
+
+extern void UnloadModelData(const unsigned char *model_data);
+
+extern NetDef CreateNet(const unsigned char *model_data);
+
+extern const std::string ModelChecksum();
+
+}  // namespace MACE_CPU_MODEL_TAG
+#endif
+
+#ifdef MACE_GPU_MODEL_TAG
+namespace MACE_GPU_MODEL_TAG {
+
+extern const unsigned char *LoadModelData(const char *model_data_file);
+
+extern void UnloadModelData(const unsigned char *model_data);
+
+extern NetDef CreateNet(const unsigned char *model_data);
+
+extern const std::string ModelChecksum();
+
+}  // namespace MACE_GPU_MODEL_TAG
+#endif
+
+#ifdef MACE_DSP_MODEL_TAG
+namespace MACE_DSP_MODEL_TAG {
+
+extern const unsigned char *LoadModelData(const char *model_data_file);
+
+extern void UnloadModelData(const unsigned char *model_data);
+
+extern NetDef CreateNet(const unsigned char *model_data);
+
+extern const std::string ModelChecksum();
+
+}  // namespace MACE_DSP_MODEL_TAG
+#endif
+
+}  // namespace mace
+
+void ParseShape(const string &str, vector<int64_t> *shape) {
+  string tmp = str;
+  while (!tmp.empty()) {
+    int dim = atoi(tmp.data());
+    shape->push_back(dim);
+    size_t next_offset = tmp.find(",");
+    if (next_offset == string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+}
+
+DeviceType ParseDeviceType(const string &device_str) {
+  if (device_str.compare("CPU") == 0) {
+    return DeviceType::CPU;
+  } else if (device_str.compare("NEON") == 0) {
+    return DeviceType::NEON;
+  } else if (device_str.compare("OPENCL") == 0) {
+    return DeviceType::OPENCL;
+  } else if (device_str.compare("HEXAGON") == 0) {
+    return DeviceType::HEXAGON;
+  } else {
+    return DeviceType::CPU;
+  }
+}
+
+DEFINE_string(input_shape, "1,224,224,3", "input shape, separated by comma");
+DEFINE_string(output_shape, "1,224,224,2", "output shape, separated by comma");
+DEFINE_string(input_file, "", "input file name");
+DEFINE_string(cpu_model_data_file, "", "cpu model data file name");
+DEFINE_string(gpu_model_data_file, "", "gpu model data file name");
+DEFINE_string(dsp_model_data_file, "", "dsp model data file name");
+DEFINE_int32(run_seconds, 10, "run seconds");
+
+int main(int argc, char **argv) {
+  gflags::SetUsageMessage("some usage message");
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  LOG(INFO) << "mace version: " << MaceVersion();
+  LOG(INFO) << "mace git version: " << MaceGitVersion();
+#ifdef MACE_CPU_MODEL_TAG
+  LOG(INFO) << "cpu model checksum: "
+            << mace::MACE_CPU_MODEL_TAG::ModelChecksum();
+#endif
+#ifdef MACE_GPU_MODEL_TAG
+  LOG(INFO) << "gpu model checksum: "
+            << mace::MACE_GPU_MODEL_TAG::ModelChecksum();
+#endif
+#ifdef MACE_DSP_MODEL_TAG
+  LOG(INFO) << "dsp model checksum: "
+            << mace::MACE_DSP_MODEL_TAG::ModelChecksum();
+#endif
+  LOG(INFO) << "input_shape: " << FLAGS_input_shape;
+  LOG(INFO) << "output_shape: " << FLAGS_output_shape;
+  LOG(INFO) << "input_file: " << FLAGS_input_file;
+  LOG(INFO) << "cpu_model_data_file: " << FLAGS_cpu_model_data_file;
+  LOG(INFO) << "gpu_model_data_file: " << FLAGS_gpu_model_data_file;
+  LOG(INFO) << "dsp_model_data_file: " << FLAGS_dsp_model_data_file;
+  LOG(INFO) << "run_seconds: " << FLAGS_run_seconds;
+
+  vector<int64_t> input_shape_vec;
+  vector<int64_t> output_shape_vec;
+  ParseShape(FLAGS_input_shape, &input_shape_vec);
+  ParseShape(FLAGS_output_shape, &output_shape_vec);
+
+  int64_t input_size =
+      std::accumulate(input_shape_vec.begin(), input_shape_vec.end(), 1,
+                      std::multiplies<int64_t>());
+  int64_t output_size =
+      std::accumulate(output_shape_vec.begin(), output_shape_vec.end(), 1,
+                      std::multiplies<int64_t>());
+  std::unique_ptr<float[]> input_data(new float[input_size]);
+  std::unique_ptr<float[]> cpu_output_data(new float[output_size]);
+  std::unique_ptr<float[]> gpu_output_data(new float[output_size]);
+  std::unique_ptr<float[]> dsp_output_data(new float[output_size]);
+
+  // load input
+  ifstream in_file(FLAGS_input_file, ios::in | ios::binary);
+  if (in_file.is_open()) {
+    in_file.read(reinterpret_cast<char *>(input_data.get()),
+                 input_size * sizeof(float));
+    in_file.close();
+  } else {
+    LOG(ERROR) << "Failed to open input file: " << FLAGS_input_file;
+    return -1;
+  }
+
+  int64_t t0, t1;
+#ifdef MACE_CPU_MODEL_TAG
+  /* --------------------- CPU init ----------------------- */
+  LOG(INFO) << "Load & init cpu model and warm up";
+  const unsigned char *cpu_model_data =
+      mace::MACE_CPU_MODEL_TAG::LoadModelData(
+          FLAGS_cpu_model_data_file.c_str());
+  NetDef cpu_net_def = mace::MACE_CPU_MODEL_TAG::CreateNet(cpu_model_data);
+
+  mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU);
+
+  LOG(INFO) << "CPU Warm up run";
+  t0 = NowMicros();
+  cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
+  t1 = NowMicros();
+  LOG(INFO) << "CPU 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+#ifdef MACE_GPU_MODEL_TAG
+  /* --------------------- GPU init ----------------------- */
+  LOG(INFO) << "Load & init gpu model and warm up";
+  const unsigned char *gpu_model_data =
+      mace::MACE_GPU_MODEL_TAG::LoadModelData(
+          FLAGS_gpu_model_data_file.c_str());
+  NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
+
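+  // The engine is assumed to hold its own (device-side) copy of the weights
+  // after construction, which is why the host-side model data can be
+  // unloaded immediately afterwards.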
+  mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL);
+  mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
+
+  LOG(INFO) << "GPU Warm up run";
+  t0 = NowMicros();
+  gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
+  t1 = NowMicros();
+  LOG(INFO) << "GPU 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+#ifdef MACE_DSP_MODEL_TAG
+  /* --------------------- DSP init ----------------------- */
+  LOG(INFO) << "Load & init dsp model and warm up";
+  const unsigned char *dsp_model_data =
+      mace::MACE_DSP_MODEL_TAG::LoadModelData(
+          FLAGS_dsp_model_data_file.c_str());
+  NetDef dsp_net_def = mace::MACE_DSP_MODEL_TAG::CreateNet(dsp_model_data);
+
+  mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON);
+  mace::MACE_DSP_MODEL_TAG::UnloadModelData(dsp_model_data);
+
+  LOG(INFO) << "DSP Warm up run";
+  t0 = NowMicros();
+  dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
+  t1 = NowMicros();
+  LOG(INFO) << "DSP 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+  double cpu_throughput = 0;
+  double gpu_throughput = 0;
+  double dsp_throughput = 0;
+  int64_t run_micros = FLAGS_run_seconds * 1000000;
+
+#ifdef MACE_CPU_MODEL_TAG
+  std::thread cpu_thread([&]() {
+    int64_t frames = 0;
+    int64_t micros = 0;
+    int64_t start = NowMicros();
+    for (; micros < run_micros; ++frames) {
+      cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
+      int64_t end = NowMicros();
+      micros = end - start;
+    }
+    cpu_throughput = frames * 1000000.0 / micros;
+  });
+#endif
+
+#ifdef MACE_GPU_MODEL_TAG
+  std::thread gpu_thread([&]() {
+    int64_t frames = 0;
+    int64_t micros = 0;
+    int64_t start = NowMicros();
+    for (; micros < run_micros; ++frames) {
+      gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
+      int64_t end = NowMicros();
+      micros = end - start;
+    }
+    gpu_throughput = frames * 1000000.0 / micros;
+  });
+#endif
+
+#ifdef MACE_DSP_MODEL_TAG
+  std::thread dsp_thread([&]() {
+    int64_t frames = 0;
+    int64_t micros = 0;
+    int64_t start = NowMicros();
+    for (; micros < run_micros; ++frames) {
+      dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
+      int64_t end = NowMicros();
+      micros = end - start;
+    }
+    dsp_throughput = frames * 1000000.0 / micros;
+  });
+#endif
+
+  double total_throughput = 0;
+
+#ifdef MACE_CPU_MODEL_TAG
+  cpu_thread.join();
+  LOG(INFO) << "CPU throughput: " << cpu_throughput << " f/s";
+  total_throughput += cpu_throughput;
+#endif
+#ifdef MACE_GPU_MODEL_TAG
+  gpu_thread.join();
+  LOG(INFO) << "GPU throughput: " << gpu_throughput << " f/s";
+  total_throughput += gpu_throughput;
+#endif
+#ifdef MACE_DSP_MODEL_TAG
+  dsp_thread.join();
+  LOG(INFO) << "DSP throughput: " << dsp_throughput << " f/s";
+  total_throughput += dsp_throughput;
+#endif
+
+  LOG(INFO) << "Total throughput: " << total_throughput << " f/s";
+
+  return 0;
+}
diff --git a/mace/benchmark/stat_summarizer.cc b/mace/benchmark/stat_summarizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..68801fb26aef120b74a23cf561056f9f57398c6f
--- /dev/null
+++ b/mace/benchmark/stat_summarizer.cc
@@ -0,0 +1,320 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
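+//
+// Note: this summarizer appears to be adapted from TensorFlow's
+// tools/benchmark StatSummarizer (stat_summarizer.h still carries the
+// original usage reference).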
+//
+
+#include "benchmark/stat_summarizer.h"
+#include "mace/public/mace.h"
+#include "mace/utils/logging.h"
+
+#include <iomanip>
+#include <iostream>
+#include <queue>
+#include <sstream>
+
+namespace mace {
+
+StatSummarizer::StatSummarizer(const StatSummarizerOptions &options)
+    : options_(options) {}
+
+StatSummarizer::~StatSummarizer() {}
+
+void StatSummarizer::Reset() {
+  run_total_us_.Reset();
+  memory_.Reset();
+  details_.clear();
+}
+
+void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) {
+  int64_t curr_total_us = 0;
+  int64_t mem_total = 0;
+
+  if (run_metadata.op_stats.empty()) {
+    std::cerr << "Runtime op stats should not be empty" << std::endl;
+    abort();
+  }
+  int64_t first_node_start_us = run_metadata.op_stats[0].stats.start_micros;
+
+  int node_num = 0;
+  for (const auto &ops : run_metadata.op_stats) {
+    std::string name = ops.operator_name;
+    std::string op_type = ops.type;
+
+    ++node_num;
+    const int64_t curr_time = ops.stats.end_micros - ops.stats.start_micros;
+    curr_total_us += curr_time;
+    auto result = details_.emplace(name, Detail());
+    Detail *detail = &(result.first->second);
+
+    detail->start_us.UpdateStat(ops.stats.start_micros - first_node_start_us);
+    detail->rel_end_us.UpdateStat(curr_time);
+
+    // If this is the first pass, initialize some values.
+    if (result.second) {
+      detail->name = name;
+      detail->type = op_type;
+
+      detail->run_order = node_num;
+
+      detail->times_called = 0;
+    }
+
+    ++detail->times_called;
+  }
+
+  run_total_us_.UpdateStat(curr_total_us);
+  memory_.UpdateStat(mem_total);
+}
+
+std::string StatSummarizer::ShortSummary() const {
+  std::stringstream stream;
+  stream << "Timings (microseconds): ";
+  run_total_us_.OutputToStream(&stream);
+  stream << std::endl;
+
+  stream << "Memory (bytes): ";
+  memory_.OutputToStream(&stream);
+  stream << std::endl;
+
+  stream << details_.size() << " nodes observed" << std::endl;
+  return stream.str();
+}
+
+std::ostream &InitField(std::ostream &stream, int width) {
+  stream << "\t" << std::right << std::setw(width) << std::fixed
+         << std::setprecision(3);
+  return stream;
+}
+
+std::string StatSummarizer::HeaderString(const std::string &title) const {
+  std::stringstream stream;
+
+  stream << "============================== " << title
+         << " ==============================" << std::endl;
+
+  InitField(stream, 14) << "[node type]";
+  InitField(stream, 9) << "[start]";
+  InitField(stream, 9) << "[first]";
+  InitField(stream, 9) << "[avg ms]";
+  InitField(stream, 8) << "[%]";
+  InitField(stream, 8) << "[cdf%]";
+  InitField(stream, 10) << "[mem KB]";
+  InitField(stream, 9) << "[times called]";
+  stream << "\t"
+         << "[Name]";
+  return stream.str();
+}
+
+std::string StatSummarizer::ColumnString(const StatSummarizer::Detail &detail,
+                                         const int64_t cumulative_stat_on_node,
+                                         const Stat<int64_t> &stat) const {
+  const double start_ms = detail.start_us.avg() / 1000.0;
+  const double first_time_ms = detail.rel_end_us.first() / 1000.0;
+  const double avg_time_ms = detail.rel_end_us.avg() / 1000.0;
+  const double percentage = detail.rel_end_us.sum() * 100.0 / stat.sum();
+  const double cdf_percentage =
+      (cumulative_stat_on_node * 100.0f) / stat.sum();
+  const int64_t times_called = detail.times_called / num_runs();
+
+  std::stringstream stream;
+  InitField(stream, 14) << detail.type;
+  InitField(stream, 9) << start_ms;
+  InitField(stream, 9) << first_time_ms;
+  InitField(stream, 9) << avg_time_ms;
+  InitField(stream, 7) << percentage << "%";
+  InitField(stream, 7) << cdf_percentage << "%";
+  InitField(stream, 10) << detail.mem_used.newest() / 1000.0;
+  InitField(stream, 9) << times_called;
+  stream << "\t" << detail.name;
+
+  return stream.str();
+}
+
+void StatSummarizer::OrderNodesByMetric(
+    SortingMetric metric, std::vector<const Detail *> *details) const {
+  std::priority_queue<std::pair<std::string, const Detail *>> sorted_list;
+  const int num_nodes = details_.size();
+
+  for (const auto &det : details_) {
+    const Detail *detail = &(det.second);
+    std::stringstream stream;
+    stream << std::setw(20) << std::right << std::setprecision(10)
+           << std::fixed;
+
+    switch (metric) {
+      case BY_NAME:
+        stream << detail->name;
+        break;
+      case BY_RUN_ORDER:
+        stream << num_nodes - detail->run_order;
+        break;
+      case BY_TIME:
+        stream << detail->rel_end_us.avg();
+        break;
+      case BY_MEMORY:
+        stream << detail->mem_used.avg();
+        break;
+      case BY_TYPE:
+        stream << detail->type;
+        break;
+      default:
+        stream << "";
+        break;
+    }
+
+    sorted_list.emplace(stream.str(), detail);
+  }
+
+  while (!sorted_list.empty()) {
+    auto entry = sorted_list.top();
+    sorted_list.pop();
+    details->push_back(entry.second);
+  }
+}
+
+void StatSummarizer::ComputeStatsByType(
+    std::map<std::string, int64_t> *node_type_map_count,
+    std::map<std::string, int64_t> *node_type_map_time,
+    std::map<std::string, int64_t> *node_type_map_memory,
+    std::map<std::string, int64_t> *node_type_map_times_called,
+    int64_t *accumulated_us) const {
+  int64_t run_count = run_total_us_.count();
+
+  for (const auto &det : details_) {
+    const std::string node_name = det.first;
+    const Detail &detail = det.second;
+
+    int64_t curr_time_val =
+        static_cast<int64_t>(detail.rel_end_us.sum() / run_count);
+    *accumulated_us += curr_time_val;
+
+    int64_t curr_memory_val = detail.mem_used.newest();
+
+    const std::string &node_type = detail.type;
+
+    (*node_type_map_count)[node_type] += 1;
+    (*node_type_map_time)[node_type] += curr_time_val;
+    (*node_type_map_memory)[node_type] += curr_memory_val;
+    (*node_type_map_times_called)[node_type] +=
+        detail.times_called / run_count;
+  }
+}
+
+std::string StatSummarizer::GetStatsByNodeType() const {
+  std::stringstream stream;
+
+  stream << "============================== Summary by node type "
+            "=============================="
+         << std::endl;
+
+  LOG(INFO) << "Number of nodes executed: " << details_.size();
+
+  std::map<std::string, int64_t> node_type_map_count;
+  std::map<std::string, int64_t> node_type_map_time;
+  std::map<std::string, int64_t> node_type_map_memory;
+  std::map<std::string, int64_t> node_type_map_times_called;
+  int64_t accumulated_us = 0;
+
+  ComputeStatsByType(&node_type_map_count, &node_type_map_time,
+                     &node_type_map_memory, &node_type_map_times_called,
+                     &accumulated_us);
+
+  // Sort them.
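+  // timings is a max-heap keyed on each node type's total time, so types
+  // pop in descending cost order; the nested pair carries (type, memory).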
+  std::priority_queue<std::pair<int64_t, std::pair<std::string, int64_t>>>
+      timings;
+  for (const auto &node_type : node_type_map_time) {
+    const int64_t mem_used = node_type_map_memory[node_type.first];
+    timings.emplace(node_type.second,
+                    std::pair<std::string, int64_t>(node_type.first,
+                                                    mem_used));
+  }
+
+  InitField(stream, 14) << "[Node type]";
+  InitField(stream, 9) << "[count]";
+  InitField(stream, 10) << "[avg ms]";
+  InitField(stream, 11) << "[avg %]";
+  InitField(stream, 11) << "[cdf %]";
+  InitField(stream, 10) << "[mem KB]";
+  InitField(stream, 10) << "[times called]";
+  stream << std::endl;
+
+  float cdf = 0.0f;
+  while (!timings.empty()) {
+    auto entry = timings.top();
+    timings.pop();
+
+    const std::string node_type = entry.second.first;
+    const float memory = entry.second.second / 1000.0f;
+
+    const int64_t node_type_total_us = entry.first;
+    const float time_per_run_ms = node_type_total_us / 1000.0f;
+
+    const float percentage =
+        ((entry.first / static_cast<float>(accumulated_us)) * 100.0f);
+    cdf += percentage;
+
+    InitField(stream, 14) << node_type;
+    InitField(stream, 9) << node_type_map_count[node_type];
+    InitField(stream, 10) << time_per_run_ms;
+    InitField(stream, 10) << percentage << "%";
+    InitField(stream, 10) << cdf << "%";
+    InitField(stream, 10) << memory;
+    InitField(stream, 9) << node_type_map_times_called[node_type];
+    stream << std::endl;
+  }
+  stream << std::endl;
+  return stream.str();
+}
+
+std::string StatSummarizer::GetStatsByMetric(const std::string &title,
+                                             SortingMetric sorting_metric,
+                                             int num_stats) const {
+  std::vector<const Detail *> details;
+  OrderNodesByMetric(sorting_metric, &details);
+
+  double cumulative_stat_on_node = 0;
+
+  std::stringstream stream;
+  stream << HeaderString(title) << std::endl;
+  int stat_num = 0;
+  for (auto detail : details) {
+    ++stat_num;
+    if (num_stats > 0 && stat_num > num_stats) {
+      break;
+    }
+
+    cumulative_stat_on_node += detail->rel_end_us.sum();
+    stream << ColumnString(*detail, cumulative_stat_on_node, run_total_us_)
+           << std::endl;
+  }
+  stream << std::endl;
+  return stream.str();
+}
+
+std::string StatSummarizer::GetOutputString() const {
+  std::stringstream stream;
+  if (options_.show_run_order) {
+    stream << GetStatsByMetric("Run Order", BY_RUN_ORDER,
+                               options_.run_order_limit);
+  }
+  if (options_.show_time) {
+    stream << GetStatsByMetric("Top by Computation Time", BY_TIME,
+                               options_.time_limit);
+  }
+  if (options_.show_memory) {
+    stream << GetStatsByMetric("Top by Memory Use", BY_MEMORY,
+                               options_.memory_limit);
+  }
+  if (options_.show_type) {
+    stream << GetStatsByNodeType();
+  }
+  if (options_.show_summary) {
+    stream << ShortSummary() << std::endl;
+  }
+  return stream.str();
+}
+
+void StatSummarizer::PrintOperatorStats() const {
+  std::string output = GetOutputString();
+  std::istringstream iss(output);
+  for (std::string line; std::getline(iss, line);) {
+    LOG(INFO) << line;
+  }
+}
+
+}  // namespace mace
diff --git a/mace/benchmark/stat_summarizer.h b/mace/benchmark/stat_summarizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..368b1e30299b77467b847396eeb20df586895618
--- /dev/null
+++ b/mace/benchmark/stat_summarizer.h
@@ -0,0 +1,199 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
+#define MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <new>
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace mace {
+
+class RunMetadata;
+
+template <typename ValueType, typename HighPrecisionValueType = double>
+class Stat {
+ public:
+  void UpdateStat(ValueType v) {
+    if (count_ == 0) {
+      first_ = v;
+    }
+
+    newest_ = v;
+    max_ = std::max(v, max_);
+    min_ = std::min(v, min_);
+    ++count_;
+    sum_ += v;
+    squared_sum_ += static_cast<HighPrecisionValueType>(v) * v;
+  }
+
+  void Reset() { new (this) Stat(); }
+
+  bool empty() const { return count_ == 0; }
+
+  ValueType first() const { return first_; }
+
+  ValueType newest() const { return newest_; }
+
+  ValueType max() const { return max_; }
+
+  ValueType min() const { return min_; }
+
+  int64_t count() const { return count_; }
+
+  ValueType sum() const { return sum_; }
+
+  HighPrecisionValueType squared_sum() const { return squared_sum_; }
+
+  bool all_same() const { return (count_ == 0 || min_ == max_); }
+
+  HighPrecisionValueType avg() const {
+    return empty() ? std::numeric_limits<ValueType>::quiet_NaN()
+                   : static_cast<HighPrecisionValueType>(sum_) / count_;
+  }
+
+  ValueType std_deviation() const {
+    return all_same() ? 0 : std::sqrt(squared_sum_ / count_ - avg() * avg());
+  }
+
+  void OutputToStream(std::ostream *stream) const {
+    if (empty()) {
+      *stream << "count=0";
+    } else if (all_same()) {
+      *stream << "count=" << count_ << " curr=" << newest_;
+      if (count_ > 1) *stream << "(all same)";
+    } else {
+      *stream << "count=" << count_ << " first=" << first_
+              << " curr=" << newest_ << " min=" << min_ << " max=" << max_
+              << " avg=" << avg() << " std=" << std_deviation();
+    }
+  }
+
+  friend std::ostream &operator<<(std::ostream &stream,
+                                  const Stat &stat) {
+    stat.OutputToStream(&stream);
+    return stream;
+  }
+
+ private:
+  ValueType first_ = 0;
+  ValueType newest_ = 0;
+  ValueType max_ = std::numeric_limits<ValueType>::min();
+  ValueType min_ = std::numeric_limits<ValueType>::max();
+  int64_t count_ = 0;
+  ValueType sum_ = 0;
+  HighPrecisionValueType squared_sum_ = 0;
+};
+
+// Used to control the output of the statistics summarizer.
+class StatSummarizerOptions {
+ public:
+  StatSummarizerOptions()
+      : show_run_order(true),
+        run_order_limit(0),
+        show_time(true),
+        time_limit(10),
+        show_memory(true),
+        memory_limit(10),
+        show_type(true),
+        show_summary(true) {}
+
+  bool show_run_order;
+  int run_order_limit;
+  bool show_time;
+  int time_limit;
+  bool show_memory;
+  int memory_limit;
+  bool show_type;
+  bool show_summary;
+};
+
+// A StatSummarizer assists in performance analysis of Graph executions.
+//
+// It summarizes time spent executing (on GPU/CPU), memory used, etc. across
+// multiple executions of a single Graph from the StepStats collected during
+// graph execution.
+//
+// See tensorflow/tools/benchmark/benchmark_model.cc for an example usage.
+class StatSummarizer {
+ public:
+  enum SortingMetric {
+    BY_NAME,
+    BY_RUN_ORDER,
+    BY_TIME,
+    BY_MEMORY,
+    BY_TYPE,
+  };
+
+  explicit StatSummarizer(const StatSummarizerOptions &options);
+
+  ~StatSummarizer();
+
+  // Adds another run's StepStats output to the aggregate counts.
+  void ProcessMetadata(const RunMetadata &run_metadata);
+
+  // Returns a string detailing the accumulated runtime stats in a
+  // tab-separated format which can be pasted into a spreadsheet for further
+  // analysis.
+  std::string GetOutputString() const;
+
+  std::string ShortSummary() const;
+
+  // Prints the string returned by GetOutputString().
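+  // The string is logged line by line via LOG(INFO), presumably to keep each
+  // line within the logger's message-length limit (e.g. Android logcat).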
+  void PrintOperatorStats() const;
+
+  void ComputeStatsByType(
+      std::map<std::string, int64_t> *node_type_map_count,
+      std::map<std::string, int64_t> *node_type_map_time,
+      std::map<std::string, int64_t> *node_type_map_memory,
+      std::map<std::string, int64_t> *node_type_map_times_called,
+      int64_t *accumulated_us) const;
+
+  std::string GetStatsByNodeType() const;
+
+  std::string GetStatsByMetric(const std::string &title,
+                               SortingMetric sorting_metric,
+                               int num_stats) const;
+
+  void Reset();
+
+  // Returns number of runs.
+  int num_runs() const { return static_cast<int>(run_total_us_.count()); }
+
+  // Returns stats of total microseconds spent by all nodes in each run.
+  const Stat<int64_t> &run_total_us() const { return run_total_us_; }
+
+ private:
+  struct Detail {
+    std::string name;
+    std::string type;
+    int64_t run_order;
+    Stat<int64_t> start_us;
+    Stat<int64_t> rel_end_us;
+    Stat<int64_t> mem_used;
+    int64_t times_called;
+  };
+
+  void OrderNodesByMetric(SortingMetric sorting_metric,
+                          std::vector<const Detail *> *details) const;
+
+  std::string HeaderString(const std::string &title) const;
+  std::string ColumnString(const Detail &detail,
+                           const int64_t cumulative_stat_on_node,
+                           const Stat<int64_t> &stat) const;
+
+  Stat<int64_t> run_total_us_;
+  Stat<int64_t> memory_;
+
+  std::map<std::string, Detail> details_;
+  StatSummarizerOptions options_;
+};
+
+}  // namespace mace
+
+#endif  // MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_