diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..41d9067871bf59b2f2bc3590147b2868226b1944
--- /dev/null
+++ b/mace/benchmark/BUILD
@@ -0,0 +1,66 @@
+# Benchmark
+# Examples
+load(
+    "//:mace.bzl",
+    "if_production_mode",
+    "if_not_production_mode",
+    "if_hexagon_enabled",
+    "if_openmp_enabled",
+)
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "stat_summarizer",
+    srcs = ["stat_summarizer.cc"],
+    hdrs = ["stat_summarizer.h"],
+    linkstatic = 1,
+    deps = [
+        "@mace//:mace_headers",
+    ],
+)
+
+cc_binary(
+    name = "benchmark_model",
+    srcs = [
+        "benchmark_model.cc",
+    ],
+    linkopts = if_openmp_enabled(["-fopenmp"]),
+    linkstatic = 1,
+    deps = [
+        ":stat_summarizer",
+        "//codegen:generated_models",
+        "//external:gflags_nothreads",
+    ] + if_hexagon_enabled([
+        "//lib/hexagon:hexagon",
+    ]) + if_production_mode([
+        "@mace//:mace_prod",
+        "//codegen:generated_opencl_prod",
+        "//codegen:generated_tuning_params",
+    ]) + if_not_production_mode([
+        "@mace//:mace_dev",
+    ]),
+)
+
+cc_library(
+    name = "libmace_merged",
+    srcs = [
+        "libmace_merged.a",
+    ],
+    visibility = ["//visibility:private"],
+)
+
+cc_binary(
+    name = "model_throughput_test",
+    srcs = ["model_throughput_test.cc"],
+    linkopts = if_openmp_enabled(["-fopenmp"]),
+    linkstatic = 1,
+    deps = [
+        ":libmace_merged",
+        "//external:gflags_nothreads",
+        "//lib/hexagon",
+        "@mace//:mace",
+        "@mace//:mace_headers",
+        "@mace//:mace_prod",
+    ],
+)
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26bfeb618bdeff76c56e9ceb2cb33930a43dc461
--- /dev/null
+++ b/mace/benchmark/benchmark_model.cc
@@ -0,0 +1,362 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
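+//
+// Example invocation (a sketch; the flag values below are illustrative,
+// not defaults):
+//   benchmark_model --device=OPENCL \
+//     --input_node=input --input_shape=1,224,224,3 \
+//     --output_node=output --output_shape=1,1,1,1001 \
+//     --input_file=input_data --max_num_runs=100 --max_time=10.0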
+//
+
+#include "gflags/gflags.h"
+#include "mace/public/mace.h"
+#include "mace/utils/logging.h"
+#include "benchmark/stat_summarizer.h"
+
+#include <sys/time.h>
+#include <cctype>
+#include <chrono>
+#include <cstdlib>
+#include <fstream>
+#include <memory>
+#include <numeric>
+#include <sstream>
+#include <thread>
+
+namespace mace {
+namespace MACE_MODEL_TAG {
+
+extern const unsigned char *LoadModelData(const char *model_data_file);
+
+extern void UnloadModelData(const unsigned char *model_data);
+
+extern NetDef CreateNet(const unsigned char *model_data);
+
+extern const std::string ModelChecksum();
+
+}  // namespace MACE_MODEL_TAG
+}  // namespace mace
+
+namespace mace {
+namespace str_util {
+
+std::vector<std::string> Split(const std::string &str, char delims) {
+  std::vector<std::string> result;
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    size_t next_offset = tmp.find(delims);
+    result.push_back(tmp.substr(0, next_offset));
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+  return result;
+}
+
+bool SplitAndParseToInts(const std::string &str,
+                         char delims,
+                         std::vector<int64_t> *result) {
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    int64_t dim = atoi(tmp.data());
+    result->push_back(dim);
+    size_t next_offset = tmp.find(delims);
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+  return true;
+}
+
+}  // namespace str_util
+
+namespace benchmark {
+
+void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    int dim = atoi(tmp.data());
+    shape->push_back(dim);
+    size_t next_offset = tmp.find(",");
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+}
+
+std::string FormatName(const std::string &input) {
+  std::string res = input;
+  for (size_t i = 0; i < input.size(); ++i) {
+    if (!::isalnum(res[i])) res[i] = '_';
+  }
+  return res;
+}
+
+inline int64_t NowMicros() {
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+}
+
+bool RunInference(MaceEngine *engine,
+                  const std::vector<MaceInputInfo> &input_infos,
+                  std::map<std::string, float *> &output_infos,
+                  StatSummarizer *summarizer,
+                  int64_t *inference_time_us) {
+  RunMetadata run_metadata;
+  RunMetadata *run_metadata_ptr = nullptr;
+  if (summarizer) {
+    run_metadata_ptr = &run_metadata;
+  }
+  if (input_infos.size() == 1 && output_infos.size() == 1) {
+    const int64_t start_time = NowMicros();
+    bool s = engine->Run(input_infos[0].data, input_infos[0].shape,
+                         output_infos.begin()->second, run_metadata_ptr);
+    const int64_t end_time = NowMicros();
+
+    if (!s) {
+      LOG(ERROR) << "Error during inference.";
+      return s;
+    }
+    *inference_time_us = end_time - start_time;
+  } else {
+    const int64_t start_time = NowMicros();
+    bool s = engine->Run(input_infos, output_infos, run_metadata_ptr);
+    const int64_t end_time = NowMicros();
+
+    if (!s) {
+      LOG(ERROR) << "Error during inference.";
+      return s;
+    }
+    *inference_time_us = end_time - start_time;
+  }
+
+  if (summarizer != nullptr) {
+    summarizer->ProcessMetadata(run_metadata);
+  }
+
+  return true;
+}
+
+bool Run(MaceEngine *engine,
+         const std::vector<MaceInputInfo> &input_infos,
+         std::map<std::string, float *> &output_infos,
+         StatSummarizer *summarizer,
+         int num_runs,
+         double max_time_sec,
+         int64_t sleep_sec,
+         int64_t *total_time_us,
+         int64_t *actual_num_runs) {
+  *total_time_us = 0;
+
+  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
+            << max_time_sec << " seconds "
"with " : "without ") + << "detailed stat logging, with " << sleep_sec + << "s sleep between inferences"; + + Stat stat; + + bool util_max_time = (num_runs <= 0); + for (int i = 0; util_max_time || i < num_runs; ++i) { + int64_t inference_time_us = 0; + bool s = RunInference(engine, input_infos, output_infos, summarizer, &inference_time_us); + stat.UpdateStat(inference_time_us); + (*total_time_us) += inference_time_us; + ++(*actual_num_runs); + + if (max_time_sec > 0 && (*total_time_us / 1000000.0) > max_time_sec) { + break; + } + + if (!s) { + LOG(INFO) << "Failed on run " << i; + return s; + } + + if (sleep_sec > 0) { + std::this_thread::sleep_for(std::chrono::seconds(sleep_sec)); + } + } + + std::stringstream stream; + stat.OutputToStream(&stream); + LOG(INFO) << stream.str(); + + return true; +} + +DEFINE_string(device, "CPU", "Device [CPU|OPENCL]"); +DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma"); +DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma"); +DEFINE_string(input_shape, "", "input shape, separated by colon and comma"); +DEFINE_string(output_shape, "", "output shape, separated by colon and comma"); +DEFINE_string(input_file, "", "input file name"); +DEFINE_int32(max_num_runs, 100, "number of runs max"); +DEFINE_string(max_time, "10.0", "length to run max"); +DEFINE_string(inference_delay, "-1", "delay between runs in seconds"); +DEFINE_string(inter_benchmark_delay, "-1", "delay between benchmarks in seconds"); +DEFINE_string(benchmark_name, "", "benchmark name"); +DEFINE_bool(show_run_order, true, "whether to list stats by run order"); +DEFINE_int32(run_order_limit, 0, "how many items to show by run order"); +DEFINE_bool(show_time, true, "whether to list stats by time taken"); +DEFINE_int32(time_limit, 10, "how many items to show by time taken"); +DEFINE_bool(show_memory, false, "whether to list stats by memory used"); +DEFINE_int32(memory_limit, 10, "how many items to show by memory used"); +DEFINE_bool(show_type, true, "whether to list stats by op type"); +DEFINE_bool(show_summary, true, "whether to show a summary of the stats"); +DEFINE_bool(show_flops, true, "whether to estimate the model's FLOPs"); +DEFINE_int32(warmup_runs, 1, "how many runs to initialize model"); +DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0"); + +int Main(int argc, char **argv) { + MACE_CHECK(FLAGS_device != "HEXAGON", "Model benchmark tool do not support DSP."); + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "Benchmark name: [" << FLAGS_benchmark_name << "]"; + LOG(INFO) << "Device: [" << FLAGS_device << "]"; + LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]"; + LOG(INFO) << "Input shapes: [" << FLAGS_input_shape << "]"; + LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]"; + LOG(INFO) << "output shapes: [" << FLAGS_output_shape << "]"; + LOG(INFO) << "Warmup runs: [" << FLAGS_warmup_runs << "]"; + LOG(INFO) << "Num runs: [" << FLAGS_max_num_runs << "]"; + LOG(INFO) << "Inter-inference delay (seconds): [" << FLAGS_inference_delay << "]"; + LOG(INFO) << "Inter-benchmark delay (seconds): [" << FLAGS_inter_benchmark_delay << "]"; + + const long int inter_inference_sleep_seconds = + std::strtol(FLAGS_inference_delay.c_str(), nullptr, 10); + const long int inter_benchmark_sleep_seconds = + std::strtol(FLAGS_inter_benchmark_delay.c_str(), nullptr, 10); + const double max_benchmark_time_seconds = + 
+      std::strtod(FLAGS_max_time.c_str(), nullptr);
+
+  std::unique_ptr<StatSummarizer> stats;
+
+  StatSummarizerOptions stats_options;
+  stats_options.show_run_order = FLAGS_show_run_order;
+  stats_options.run_order_limit = FLAGS_run_order_limit;
+  stats_options.show_time = FLAGS_show_time;
+  stats_options.time_limit = FLAGS_time_limit;
+  stats_options.show_memory = FLAGS_show_memory;
+  stats_options.memory_limit = FLAGS_memory_limit;
+  stats_options.show_type = FLAGS_show_type;
+  stats_options.show_summary = FLAGS_show_summary;
+  stats.reset(new StatSummarizer(stats_options));
+
+  DeviceType device_type = CPU;
+  if (FLAGS_device == "OPENCL") {
+    device_type = OPENCL;
+  }
+
+  std::vector<std::string> input_names =
+      str_util::Split(FLAGS_input_node, ',');
+  std::vector<std::string> output_names =
+      str_util::Split(FLAGS_output_node, ',');
+  std::vector<std::string> input_shapes =
+      str_util::Split(FLAGS_input_shape, ':');
+  std::vector<std::string> output_shapes =
+      str_util::Split(FLAGS_output_shape, ':');
+
+  const size_t input_count = input_shapes.size();
+  const size_t output_count = output_shapes.size();
+  std::vector<std::vector<int64_t>> input_shape_vec(input_count);
+  std::vector<std::vector<int64_t>> output_shape_vec(output_count);
+  for (size_t i = 0; i < input_count; ++i) {
+    ParseShape(input_shapes[i], &input_shape_vec[i]);
+  }
+  for (size_t i = 0; i < output_count; ++i) {
+    ParseShape(output_shapes[i], &output_shape_vec[i]);
+  }
+
+  const unsigned char *model_data =
+      mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
+  NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
+
+  std::vector<MaceInputInfo> input_infos(input_count);
+  std::map<std::string, float *> output_infos;
+  std::vector<std::unique_ptr<float[]>> input_datas(input_count);
+  std::vector<std::unique_ptr<float[]>> output_datas(output_count);
+
+  for (size_t i = 0; i < input_count; ++i) {
+    int64_t input_size =
+        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(),
+                        1, std::multiplies<int64_t>());
+    input_datas[i].reset(new float[input_size]);
+    // Load input.
+    std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
+                          std::ios::in | std::ios::binary);
+    if (in_file.is_open()) {
+      in_file.read(reinterpret_cast<char *>(input_datas[i].get()),
+                   input_size * sizeof(float));
+      in_file.close();
+    } else {
+      LOG(ERROR) << "Failed to open input file: "
+                 << FLAGS_input_file + "_" + FormatName(input_names[i]);
+      return -1;
+    }
+
+    input_infos[i].name = input_names[i];
+    input_infos[i].shape = input_shape_vec[i];
+    input_infos[i].data = input_datas[i].get();
+  }
+  for (size_t i = 0; i < output_count; ++i) {
+    int64_t output_size =
+        std::accumulate(output_shape_vec[i].begin(), output_shape_vec[i].end(),
+                        1, std::multiplies<int64_t>());
+    output_datas[i].reset(new float[output_size]);
+    output_infos[output_names[i]] = output_datas[i].get();
+  }
+
+  // Init model
+  LOG(INFO) << "Run init";
+  std::unique_ptr<mace::MaceEngine> engine_ptr;
+  if (input_count == 1 && output_count == 1) {
+    engine_ptr.reset(new mace::MaceEngine(&net_def, device_type));
+  } else {
+    engine_ptr.reset(new mace::MaceEngine(&net_def, device_type,
+                                          input_names, output_names));
+  }
+  if (device_type == DeviceType::OPENCL) {
+    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
+  }
+
+  LOG(INFO) << "Warm up";
+
+  int64_t warmup_time_us = 0;
+  int64_t num_warmup_runs = 0;
+  if (FLAGS_warmup_runs > 0) {
+    bool status =
+        Run(engine_ptr.get(), input_infos, output_infos, nullptr,
+            FLAGS_warmup_runs, -1.0, inter_inference_sleep_seconds,
+            &warmup_time_us, &num_warmup_runs);
+    if (!status) {
+      LOG(ERROR) << "Failed at warm up run";
+    }
+  }
+
+  if (inter_benchmark_sleep_seconds > 0) {
+    std::this_thread::sleep_for(
+        std::chrono::seconds(inter_benchmark_sleep_seconds));
+  }
+  int64_t no_stat_time_us = 0;
+  int64_t no_stat_runs = 0;
+  bool status =
+      Run(engine_ptr.get(), input_infos, output_infos,
+          nullptr, FLAGS_max_num_runs, max_benchmark_time_seconds,
+          inter_inference_sleep_seconds, &no_stat_time_us, &no_stat_runs);
+  if (!status) {
+    LOG(ERROR) << "Failed at normal no-stat run";
+  }
+
+  int64_t stat_time_us = 0;
+  int64_t stat_runs = 0;
+  status = Run(engine_ptr.get(), input_infos, output_infos,
+               stats.get(), FLAGS_max_num_runs, max_benchmark_time_seconds,
+               inter_inference_sleep_seconds, &stat_time_us, &stat_runs);
+  if (!status) {
+    LOG(ERROR) << "Failed at normal stat run";
+  }
+
+  LOG(INFO) << "Average inference timings in us: "
+            << "Warmup: "
+            << (FLAGS_warmup_runs > 0 ? warmup_time_us / FLAGS_warmup_runs : 0)
+            << ", "
+            << "no stats: " << no_stat_time_us / no_stat_runs << ", "
+            << "with stats: " << stat_time_us / stat_runs;
+
+  stats->PrintOperatorStats();
+
+  return 0;
+}
+
+}  // namespace benchmark
+}  // namespace mace
+
+int main(int argc, char **argv) { return mace::benchmark::Main(argc, argv); }
diff --git a/mace/benchmark/model_throughput_test.cc b/mace/benchmark/model_throughput_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48e622cb7b79b026b89ad6eb1fb4349056b2d9e8
--- /dev/null
+++ b/mace/benchmark/model_throughput_test.cc
@@ -0,0 +1,281 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+/**
+ * Usage:
+ * throughput_test \
+ *     --input_shape=1,224,224,3 \
+ *     --output_shape=1,224,224,2 \
+ *     --input_file=input_data \
+ *     --cpu_model_data_file=cpu_model_data.data \
+ *     --gpu_model_data_file=gpu_model_data.data \
+ *     --dsp_model_data_file=dsp_model_data.data \
+ *     --run_seconds=10
+ */
+#include <cstdint>
+#include <cstdlib>
+#include <fstream>
+#include <numeric>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "mace/public/mace.h"
+#include "mace/utils/env_time.h"
+#include "mace/utils/logging.h"
+
+using namespace std;
+using namespace mace;
+
+namespace mace {
+
+#ifdef MACE_CPU_MODEL_TAG
+namespace MACE_CPU_MODEL_TAG {
+
+extern const unsigned char *LoadModelData(const char *model_data_file);
+
+extern void UnloadModelData(const unsigned char *model_data);
+
+extern NetDef CreateNet(const unsigned char *model_data);
+
+extern const std::string ModelChecksum();
+
+}  // namespace MACE_CPU_MODEL_TAG
+#endif
+
+#ifdef MACE_GPU_MODEL_TAG
+namespace MACE_GPU_MODEL_TAG {
+
+extern const unsigned char *LoadModelData(const char *model_data_file);
+
+extern void UnloadModelData(const unsigned char *model_data);
+
+extern NetDef CreateNet(const unsigned char *model_data);
+
+extern const std::string ModelChecksum();
+
+}  // namespace MACE_GPU_MODEL_TAG
+#endif
+
+#ifdef MACE_DSP_MODEL_TAG
+namespace MACE_DSP_MODEL_TAG {
+
+extern const unsigned char *LoadModelData(const char *model_data_file);
+
+extern void UnloadModelData(const unsigned char *model_data);
+
+extern NetDef CreateNet(const unsigned char *model_data);
+
+extern const std::string ModelChecksum();
+
+}  // namespace MACE_DSP_MODEL_TAG
+#endif
+
+}  // namespace mace
+
+void ParseShape(const string &str, vector<int64_t> *shape) {
+  string tmp = str;
+  while (!tmp.empty()) {
+    int dim = atoi(tmp.data());
+    shape->push_back(dim);
+    size_t next_offset = tmp.find(",");
+    if (next_offset == string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+}
+
+DeviceType ParseDeviceType(const string &device_str) {
+  if (device_str.compare("CPU") == 0) {
+    return DeviceType::CPU;
+  } else if (device_str.compare("NEON") == 0) {
+    return DeviceType::NEON;
+  } else if (device_str.compare("OPENCL") == 0) {
+    return DeviceType::OPENCL;
+  } else if (device_str.compare("HEXAGON") == 0) {
+    return DeviceType::HEXAGON;
+  } else {
+    return DeviceType::CPU;
+  }
+}
+
+DEFINE_string(input_shape, "1,224,224,3", "input shape, separated by comma");
+DEFINE_string(output_shape, "1,224,224,2", "output shape, separated by comma");
+DEFINE_string(input_file, "", "input file name");
+DEFINE_string(cpu_model_data_file, "", "cpu model data file name");
+DEFINE_string(gpu_model_data_file, "", "gpu model data file name");
+DEFINE_string(dsp_model_data_file, "", "dsp model data file name");
+DEFINE_int32(run_seconds, 10, "run seconds");
+
+int main(int argc, char **argv) {
+  gflags::SetUsageMessage("some usage message");
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  LOG(INFO) << "mace version: " << MaceVersion();
+  LOG(INFO) << "mace git version: " << MaceGitVersion();
+#ifdef MACE_CPU_MODEL_TAG
+  LOG(INFO) << "cpu model checksum: "
+            << mace::MACE_CPU_MODEL_TAG::ModelChecksum();
+#endif
+#ifdef MACE_GPU_MODEL_TAG
+  LOG(INFO) << "gpu model checksum: "
+            << mace::MACE_GPU_MODEL_TAG::ModelChecksum();
+#endif
+#ifdef MACE_DSP_MODEL_TAG
+  LOG(INFO) << "dsp model checksum: "
+            << mace::MACE_DSP_MODEL_TAG::ModelChecksum();
+#endif
+  LOG(INFO) << "input_shape: " << FLAGS_input_shape;
+  LOG(INFO) << "output_shape: " << FLAGS_output_shape;
+  LOG(INFO) << "input_file: " << FLAGS_input_file;
+  LOG(INFO) << "cpu_model_data_file: " << FLAGS_cpu_model_data_file;
+  LOG(INFO) << "gpu_model_data_file: " << FLAGS_gpu_model_data_file;
+  LOG(INFO) << "dsp_model_data_file: " << FLAGS_dsp_model_data_file;
+  LOG(INFO) << "run_seconds: " << FLAGS_run_seconds;
+
+  vector<int64_t> input_shape_vec;
+  vector<int64_t> output_shape_vec;
+  ParseShape(FLAGS_input_shape, &input_shape_vec);
+  ParseShape(FLAGS_output_shape, &output_shape_vec);
+
+  int64_t input_size =
+      std::accumulate(input_shape_vec.begin(), input_shape_vec.end(), 1,
+                      std::multiplies<int64_t>());
+  int64_t output_size =
+      std::accumulate(output_shape_vec.begin(), output_shape_vec.end(), 1,
+                      std::multiplies<int64_t>());
+  std::unique_ptr<float[]> input_data(new float[input_size]);
+  std::unique_ptr<float[]> cpu_output_data(new float[output_size]);
+  std::unique_ptr<float[]> gpu_output_data(new float[output_size]);
+  std::unique_ptr<float[]> dsp_output_data(new float[output_size]);
+
+  // load input
+  ifstream in_file(FLAGS_input_file, ios::in | ios::binary);
+  if (in_file.is_open()) {
+    in_file.read(reinterpret_cast<char *>(input_data.get()),
+                 input_size * sizeof(float));
+    in_file.close();
+  } else {
+    LOG(ERROR) << "Failed to open input file: " << FLAGS_input_file;
+    return -1;
+  }
+
+  int64_t t0, t1;
+#ifdef MACE_CPU_MODEL_TAG
+  /* --------------------- CPU init ----------------------- */
+  LOG(INFO) << "Load & init cpu model and warm up";
+  const unsigned char *cpu_model_data =
+      mace::MACE_CPU_MODEL_TAG::LoadModelData(
+          FLAGS_cpu_model_data_file.c_str());
+  NetDef cpu_net_def = mace::MACE_CPU_MODEL_TAG::CreateNet(cpu_model_data);
+
+  mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU);
+
+  LOG(INFO) << "CPU Warm up run";
+  t0 = NowMicros();
+  cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
+  t1 = NowMicros();
+  LOG(INFO) << "CPU 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+#ifdef MACE_GPU_MODEL_TAG
+  /* --------------------- GPU init ----------------------- */
+  LOG(INFO) << "Load & init gpu model and warm up";
+  const unsigned char *gpu_model_data =
+      mace::MACE_GPU_MODEL_TAG::LoadModelData(
+          FLAGS_gpu_model_data_file.c_str());
+  NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
+
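+  // The engine is assumed to hold its own (device-side) copy of the weights
+  // after construction, which is why the host-side model data can be
+  // unloaded immediately afterwards.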
+  mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL);
+  mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
+
+  LOG(INFO) << "GPU Warm up run";
+  t0 = NowMicros();
+  gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
+  t1 = NowMicros();
+  LOG(INFO) << "GPU 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+#ifdef MACE_DSP_MODEL_TAG
+  /* --------------------- DSP init ----------------------- */
+  LOG(INFO) << "Load & init dsp model and warm up";
+  const unsigned char *dsp_model_data =
+      mace::MACE_DSP_MODEL_TAG::LoadModelData(
+          FLAGS_dsp_model_data_file.c_str());
+  NetDef dsp_net_def = mace::MACE_DSP_MODEL_TAG::CreateNet(dsp_model_data);
+
+  mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON);
+  mace::MACE_DSP_MODEL_TAG::UnloadModelData(dsp_model_data);
+
+  LOG(INFO) << "DSP Warm up run";
+  t0 = NowMicros();
+  dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
+  t1 = NowMicros();
+  LOG(INFO) << "DSP 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+  double cpu_throughput = 0;
+  double gpu_throughput = 0;
+  double dsp_throughput = 0;
+  int64_t run_micros = FLAGS_run_seconds * 1000000;
+
+#ifdef MACE_CPU_MODEL_TAG
+  std::thread cpu_thread([&]() {
+    int64_t frames = 0;
+    int64_t micros = 0;
+    int64_t start = NowMicros();
+    for (; micros < run_micros; ++frames) {
+      cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
+      int64_t end = NowMicros();
+      micros = end - start;
+    }
+    cpu_throughput = frames * 1000000.0 / micros;
+  });
+#endif
+
+#ifdef MACE_GPU_MODEL_TAG
+  std::thread gpu_thread([&]() {
+    int64_t frames = 0;
+    int64_t micros = 0;
+    int64_t start = NowMicros();
+    for (; micros < run_micros; ++frames) {
+      gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
+      int64_t end = NowMicros();
+      micros = end - start;
+    }
+    gpu_throughput = frames * 1000000.0 / micros;
+  });
+#endif
+
+#ifdef MACE_DSP_MODEL_TAG
+  std::thread dsp_thread([&]() {
+    int64_t frames = 0;
+    int64_t micros = 0;
+    int64_t start = NowMicros();
+    for (; micros < run_micros; ++frames) {
+      dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
+      int64_t end = NowMicros();
+      micros = end - start;
+    }
+    dsp_throughput = frames * 1000000.0 / micros;
+  });
+#endif
+
+  double total_throughput = 0;
+
+#ifdef MACE_CPU_MODEL_TAG
+  cpu_thread.join();
+  LOG(INFO) << "CPU throughput: " << cpu_throughput << " f/s";
+  total_throughput += cpu_throughput;
+#endif
+#ifdef MACE_GPU_MODEL_TAG
+  gpu_thread.join();
+  LOG(INFO) << "GPU throughput: " << gpu_throughput << " f/s";
+  total_throughput += gpu_throughput;
+#endif
+#ifdef MACE_DSP_MODEL_TAG
+  dsp_thread.join();
+  LOG(INFO) << "DSP throughput: " << dsp_throughput << " f/s";
+  total_throughput += dsp_throughput;
+#endif
+
+  LOG(INFO) << "Total throughput: " << total_throughput << " f/s";
+
+  return 0;
+}
diff --git a/mace/benchmark/stat_summarizer.cc b/mace/benchmark/stat_summarizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..68801fb26aef120b74a23cf561056f9f57398c6f
--- /dev/null
+++ b/mace/benchmark/stat_summarizer.cc
@@ -0,0 +1,320 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
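+//
+// Note: this summarizer appears to be adapted from TensorFlow's
+// tools/benchmark StatSummarizer (stat_summarizer.h still carries the
+// original usage reference).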
+//
+
+#include "benchmark/stat_summarizer.h"
+#include "mace/public/mace.h"
+#include "mace/utils/logging.h"
+
+#include <iomanip>
+#include <iostream>
+#include <queue>
+#include <sstream>
+
+namespace mace {
+
+StatSummarizer::StatSummarizer(const StatSummarizerOptions &options)
+    : options_(options) {}
+
+StatSummarizer::~StatSummarizer() {}
+
+void StatSummarizer::Reset() {
+  run_total_us_.Reset();
+  memory_.Reset();
+  details_.clear();
+}
+
+void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) {
+  int64_t curr_total_us = 0;
+  int64_t mem_total = 0;
+
+  if (run_metadata.op_stats.empty()) {
+    std::cerr << "Runtime op stats should not be empty" << std::endl;
+    abort();
+  }
+  int64_t first_node_start_us = run_metadata.op_stats[0].stats.start_micros;
+
+  int node_num = 0;
+  for (const auto &ops : run_metadata.op_stats) {
+    std::string name = ops.operator_name;
+    std::string op_type = ops.type;
+
+    ++node_num;
+    const int64_t curr_time = ops.stats.end_micros - ops.stats.start_micros;
+    curr_total_us += curr_time;
+    auto result = details_.emplace(name, Detail());
+    Detail *detail = &(result.first->second);
+
+    detail->start_us.UpdateStat(ops.stats.start_micros - first_node_start_us);
+    detail->rel_end_us.UpdateStat(curr_time);
+
+    // If this is the first pass, initialize some values.
+    if (result.second) {
+      detail->name = name;
+      detail->type = op_type;
+
+      detail->run_order = node_num;
+
+      detail->times_called = 0;
+    }
+
+    ++detail->times_called;
+  }
+
+  run_total_us_.UpdateStat(curr_total_us);
+  memory_.UpdateStat(mem_total);
+}
+
+std::string StatSummarizer::ShortSummary() const {
+  std::stringstream stream;
+  stream << "Timings (microseconds): ";
+  run_total_us_.OutputToStream(&stream);
+  stream << std::endl;
+
+  stream << "Memory (bytes): ";
+  memory_.OutputToStream(&stream);
+  stream << std::endl;
+
+  stream << details_.size() << " nodes observed" << std::endl;
+  return stream.str();
+}
+
+std::ostream &InitField(std::ostream &stream, int width) {
+  stream << "\t" << std::right << std::setw(width) << std::fixed
+         << std::setprecision(3);
+  return stream;
+}
+
+std::string StatSummarizer::HeaderString(const std::string &title) const {
+  std::stringstream stream;
+
+  stream << "============================== " << title
+         << " ==============================" << std::endl;
+
+  InitField(stream, 14) << "[node type]";
+  InitField(stream, 9) << "[start]";
+  InitField(stream, 9) << "[first]";
+  InitField(stream, 9) << "[avg ms]";
+  InitField(stream, 8) << "[%]";
+  InitField(stream, 8) << "[cdf%]";
+  InitField(stream, 10) << "[mem KB]";
+  InitField(stream, 9) << "[times called]";
+  stream << "\t"
+         << "[Name]";
+  return stream.str();
+}
+
+std::string StatSummarizer::ColumnString(const StatSummarizer::Detail &detail,
+                                         const int64_t cumulative_stat_on_node,
+                                         const Stat<int64_t> &stat) const {
+  const double start_ms = detail.start_us.avg() / 1000.0;
+  const double first_time_ms = detail.rel_end_us.first() / 1000.0;
+  const double avg_time_ms = detail.rel_end_us.avg() / 1000.0;
+  const double percentage = detail.rel_end_us.sum() * 100.0 / stat.sum();
+  const double cdf_percentage =
+      (cumulative_stat_on_node * 100.0f) / stat.sum();
+  const int64_t times_called = detail.times_called / num_runs();
+
+  std::stringstream stream;
+  InitField(stream, 14) << detail.type;
+  InitField(stream, 9) << start_ms;
+  InitField(stream, 9) << first_time_ms;
+  InitField(stream, 9) << avg_time_ms;
+  InitField(stream, 7) << percentage << "%";
+  InitField(stream, 7) << cdf_percentage << "%";
+  InitField(stream, 10) << detail.mem_used.newest() / 1000.0;
+  InitField(stream, 9) << times_called;
+  stream << "\t" << detail.name;
+
+  return stream.str();
+}
+
+void StatSummarizer::OrderNodesByMetric(
+    SortingMetric metric, std::vector<const Detail *> *details) const {
+  std::priority_queue<std::pair<std::string, const Detail *>> sorted_list;
+  const int num_nodes = details_.size();
+
+  for (const auto &det : details_) {
+    const Detail *detail = &(det.second);
+    std::stringstream stream;
+    stream << std::setw(20) << std::right << std::setprecision(10)
+           << std::fixed;
+
+    switch (metric) {
+      case BY_NAME:
+        stream << detail->name;
+        break;
+      case BY_RUN_ORDER:
+        stream << num_nodes - detail->run_order;
+        break;
+      case BY_TIME:
+        stream << detail->rel_end_us.avg();
+        break;
+      case BY_MEMORY:
+        stream << detail->mem_used.avg();
+        break;
+      case BY_TYPE:
+        stream << detail->type;
+        break;
+      default:
+        stream << "";
+        break;
+    }
+
+    sorted_list.emplace(stream.str(), detail);
+  }
+
+  while (!sorted_list.empty()) {
+    auto entry = sorted_list.top();
+    sorted_list.pop();
+    details->push_back(entry.second);
+  }
+}
+
+void StatSummarizer::ComputeStatsByType(
+    std::map<std::string, int64_t> *node_type_map_count,
+    std::map<std::string, int64_t> *node_type_map_time,
+    std::map<std::string, int64_t> *node_type_map_memory,
+    std::map<std::string, int64_t> *node_type_map_times_called,
+    int64_t *accumulated_us) const {
+  int64_t run_count = run_total_us_.count();
+
+  for (const auto &det : details_) {
+    const std::string node_name = det.first;
+    const Detail &detail = det.second;
+
+    int64_t curr_time_val =
+        static_cast<int64_t>(detail.rel_end_us.sum() / run_count);
+    *accumulated_us += curr_time_val;
+
+    int64_t curr_memory_val = detail.mem_used.newest();
+
+    const std::string &node_type = detail.type;
+
+    (*node_type_map_count)[node_type] += 1;
+    (*node_type_map_time)[node_type] += curr_time_val;
+    (*node_type_map_memory)[node_type] += curr_memory_val;
+    (*node_type_map_times_called)[node_type] +=
+        detail.times_called / run_count;
+  }
+}
+
+std::string StatSummarizer::GetStatsByNodeType() const {
+  std::stringstream stream;
+
+  stream << "============================== Summary by node type "
+            "=============================="
+         << std::endl;
+
+  LOG(INFO) << "Number of nodes executed: " << details_.size();
+
+  std::map<std::string, int64_t> node_type_map_count;
+  std::map<std::string, int64_t> node_type_map_time;
+  std::map<std::string, int64_t> node_type_map_memory;
+  std::map<std::string, int64_t> node_type_map_times_called;
+  int64_t accumulated_us = 0;
+
+  ComputeStatsByType(&node_type_map_count, &node_type_map_time,
+                     &node_type_map_memory, &node_type_map_times_called,
+                     &accumulated_us);
+
+  // Sort them.
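+  // timings is a max-heap keyed on each node type's total time, so types
+  // pop in descending cost order; the nested pair carries (type, memory).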
+  std::priority_queue<std::pair<int64_t, std::pair<std::string, int64_t>>>
+      timings;
+  for (const auto &node_type : node_type_map_time) {
+    const int64_t mem_used = node_type_map_memory[node_type.first];
+    timings.emplace(node_type.second,
+                    std::pair<std::string, int64_t>(node_type.first,
+                                                    mem_used));
+  }
+
+  InitField(stream, 14) << "[Node type]";
+  InitField(stream, 9) << "[count]";
+  InitField(stream, 10) << "[avg ms]";
+  InitField(stream, 11) << "[avg %]";
+  InitField(stream, 11) << "[cdf %]";
+  InitField(stream, 10) << "[mem KB]";
+  InitField(stream, 10) << "[times called]";
+  stream << std::endl;
+
+  float cdf = 0.0f;
+  while (!timings.empty()) {
+    auto entry = timings.top();
+    timings.pop();
+
+    const std::string node_type = entry.second.first;
+    const float memory = entry.second.second / 1000.0f;
+
+    const int64_t node_type_total_us = entry.first;
+    const float time_per_run_ms = node_type_total_us / 1000.0f;
+
+    const float percentage =
+        ((entry.first / static_cast<float>(accumulated_us)) * 100.0f);
+    cdf += percentage;
+
+    InitField(stream, 14) << node_type;
+    InitField(stream, 9) << node_type_map_count[node_type];
+    InitField(stream, 10) << time_per_run_ms;
+    InitField(stream, 10) << percentage << "%";
+    InitField(stream, 10) << cdf << "%";
+    InitField(stream, 10) << memory;
+    InitField(stream, 9) << node_type_map_times_called[node_type];
+    stream << std::endl;
+  }
+  stream << std::endl;
+  return stream.str();
+}
+
+std::string StatSummarizer::GetStatsByMetric(const std::string &title,
+                                             SortingMetric sorting_metric,
+                                             int num_stats) const {
+  std::vector<const Detail *> details;
+  OrderNodesByMetric(sorting_metric, &details);
+
+  double cumulative_stat_on_node = 0;
+
+  std::stringstream stream;
+  stream << HeaderString(title) << std::endl;
+  int stat_num = 0;
+  for (auto detail : details) {
+    ++stat_num;
+    if (num_stats > 0 && stat_num > num_stats) {
+      break;
+    }
+
+    cumulative_stat_on_node += detail->rel_end_us.sum();
+    stream << ColumnString(*detail, cumulative_stat_on_node, run_total_us_)
+           << std::endl;
+  }
+  stream << std::endl;
+  return stream.str();
+}
+
+std::string StatSummarizer::GetOutputString() const {
+  std::stringstream stream;
+  if (options_.show_run_order) {
+    stream << GetStatsByMetric("Run Order", BY_RUN_ORDER,
+                               options_.run_order_limit);
+  }
+  if (options_.show_time) {
+    stream << GetStatsByMetric("Top by Computation Time", BY_TIME,
+                               options_.time_limit);
+  }
+  if (options_.show_memory) {
+    stream << GetStatsByMetric("Top by Memory Use", BY_MEMORY,
+                               options_.memory_limit);
+  }
+  if (options_.show_type) {
+    stream << GetStatsByNodeType();
+  }
+  if (options_.show_summary) {
+    stream << ShortSummary() << std::endl;
+  }
+  return stream.str();
+}
+
+void StatSummarizer::PrintOperatorStats() const {
+  std::string output = GetOutputString();
+  std::istringstream iss(output);
+  for (std::string line; std::getline(iss, line);) {
+    LOG(INFO) << line;
+  }
+}
+
+}  // namespace mace
diff --git a/mace/benchmark/stat_summarizer.h b/mace/benchmark/stat_summarizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..368b1e30299b77467b847396eeb20df586895618
--- /dev/null
+++ b/mace/benchmark/stat_summarizer.h
@@ -0,0 +1,199 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
+#define MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <new>
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace mace {
+
+class RunMetadata;
+
+template <typename ValueType, typename HighPrecisionValueType = double>
+class Stat {
+ public:
+  void UpdateStat(ValueType v) {
+    if (count_ == 0) {
+      first_ = v;
+    }
+
+    newest_ = v;
+    max_ = std::max(v, max_);
+    min_ = std::min(v, min_);
+    ++count_;
+    sum_ += v;
+    squared_sum_ += static_cast<HighPrecisionValueType>(v) * v;
+  }
+
+  void Reset() { new (this) Stat(); }
+
+  bool empty() const { return count_ == 0; }
+
+  ValueType first() const { return first_; }
+
+  ValueType newest() const { return newest_; }
+
+  ValueType max() const { return max_; }
+
+  ValueType min() const { return min_; }
+
+  int64_t count() const { return count_; }
+
+  ValueType sum() const { return sum_; }
+
+  HighPrecisionValueType squared_sum() const { return squared_sum_; }
+
+  bool all_same() const { return (count_ == 0 || min_ == max_); }
+
+  HighPrecisionValueType avg() const {
+    return empty() ? std::numeric_limits<ValueType>::quiet_NaN()
+                   : static_cast<HighPrecisionValueType>(sum_) / count_;
+  }
+
+  ValueType std_deviation() const {
+    return all_same() ? 0 : std::sqrt(squared_sum_ / count_ - avg() * avg());
+  }
+
+  void OutputToStream(std::ostream *stream) const {
+    if (empty()) {
+      *stream << "count=0";
+    } else if (all_same()) {
+      *stream << "count=" << count_ << " curr=" << newest_;
+      if (count_ > 1) *stream << "(all same)";
+    } else {
+      *stream << "count=" << count_ << " first=" << first_
+              << " curr=" << newest_ << " min=" << min_ << " max=" << max_
+              << " avg=" << avg() << " std=" << std_deviation();
+    }
+  }
+
+  friend std::ostream &operator<<(std::ostream &stream,
+                                  const Stat &stat) {
+    stat.OutputToStream(&stream);
+    return stream;
+  }
+
+ private:
+  ValueType first_ = 0;
+  ValueType newest_ = 0;
+  ValueType max_ = std::numeric_limits<ValueType>::min();
+  ValueType min_ = std::numeric_limits<ValueType>::max();
+  int64_t count_ = 0;
+  ValueType sum_ = 0;
+  HighPrecisionValueType squared_sum_ = 0;
+};
+
+// Used to control the output of the statistics summarizer.
+class StatSummarizerOptions {
+ public:
+  StatSummarizerOptions()
+      : show_run_order(true),
+        run_order_limit(0),
+        show_time(true),
+        time_limit(10),
+        show_memory(true),
+        memory_limit(10),
+        show_type(true),
+        show_summary(true) {}
+
+  bool show_run_order;
+  int run_order_limit;
+  bool show_time;
+  int time_limit;
+  bool show_memory;
+  int memory_limit;
+  bool show_type;
+  bool show_summary;
+};
+
+// A StatSummarizer assists in performance analysis of Graph executions.
+//
+// It summarizes time spent executing (on GPU/CPU), memory used, etc. across
+// multiple executions of a single Graph from the StepStats collected during
+// graph execution.
+//
+// See tensorflow/tools/benchmark/benchmark_model.cc for an example usage.
+class StatSummarizer {
+ public:
+  enum SortingMetric {
+    BY_NAME,
+    BY_RUN_ORDER,
+    BY_TIME,
+    BY_MEMORY,
+    BY_TYPE,
+  };
+
+  explicit StatSummarizer(const StatSummarizerOptions &options);
+
+  ~StatSummarizer();
+
+  // Adds another run's StepStats output to the aggregate counts.
+  void ProcessMetadata(const RunMetadata &run_metadata);
+
+  // Returns a string detailing the accumulated runtime stats in a
+  // tab-separated format which can be pasted into a spreadsheet for further
+  // analysis.
+  std::string GetOutputString() const;
+
+  std::string ShortSummary() const;
+
+  // Prints the string returned by GetOutputString().
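+  // The string is logged line by line via LOG(INFO), presumably to keep each
+  // line within the logger's message-length limit (e.g. Android logcat).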
+  void PrintOperatorStats() const;
+
+  void ComputeStatsByType(
+      std::map<std::string, int64_t> *node_type_map_count,
+      std::map<std::string, int64_t> *node_type_map_time,
+      std::map<std::string, int64_t> *node_type_map_memory,
+      std::map<std::string, int64_t> *node_type_map_times_called,
+      int64_t *accumulated_us) const;
+
+  std::string GetStatsByNodeType() const;
+
+  std::string GetStatsByMetric(const std::string &title,
+                               SortingMetric sorting_metric,
+                               int num_stats) const;
+
+  void Reset();
+
+  // Returns number of runs.
+  int num_runs() const { return static_cast<int>(run_total_us_.count()); }
+
+  // Returns stats of total microseconds spent by all nodes in each run.
+  const Stat<int64_t> &run_total_us() const { return run_total_us_; }
+
+ private:
+  struct Detail {
+    std::string name;
+    std::string type;
+    int64_t run_order;
+    Stat<int64_t> start_us;
+    Stat<int64_t> rel_end_us;
+    Stat<int64_t> mem_used;
+    int64_t times_called;
+  };
+
+  void OrderNodesByMetric(SortingMetric sorting_metric,
+                          std::vector<const Detail *> *details) const;
+
+  std::string HeaderString(const std::string &title) const;
+  std::string ColumnString(const Detail &detail,
+                           const int64_t cumulative_stat_on_node,
+                           const Stat<int64_t> &stat) const;
+
+  Stat<int64_t> run_total_us_;
+  Stat<int64_t> memory_;
+
+  std::map<std::string, Detail> details_;
+  StatSummarizerOptions options_;
+};
+
+}  // namespace mace
+
+#endif  // MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_