From 37fe2203aad248a5448d2c378363d31aafa85dd7 Mon Sep 17 00:00:00 2001 From: wuchenghui Date: Thu, 8 Mar 2018 17:09:12 +0800 Subject: [PATCH] CPU+GPU+DSP full throughput benchmark --- BUILD | 27 +++- model_throughput_test.cc | 281 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 306 insertions(+), 2 deletions(-) create mode 100644 model_throughput_test.cc diff --git a/BUILD b/BUILD index ff7c6e43..41d90678 100644 --- a/BUILD +++ b/BUILD @@ -17,7 +17,7 @@ cc_library( linkstatic = 1, deps = [ "@mace//:mace_headers", - ] + ], ) cc_binary( @@ -33,7 +33,7 @@ cc_binary( "//external:gflags_nothreads", ] + if_hexagon_enabled([ "//lib/hexagon:hexagon", - ])+ if_production_mode([ + ]) + if_production_mode([ "@mace//:mace_prod", "//codegen:generated_opencl_prod", "//codegen:generated_tuning_params", @@ -41,3 +41,26 @@ cc_binary( "@mace//:mace_dev", ]), ) + +cc_library( + name = "libmace_merged", + srcs = [ + "libmace_merged.a", + ], + visibility = ["//visibility:private"], +) + +cc_binary( + name = "model_throughput_test", + srcs = ["model_throughput_test.cc"], + linkopts = if_openmp_enabled(["-fopenmp"]), + linkstatic = 1, + deps = [ + ":libmace_merged", + "//external:gflags_nothreads", + "//lib/hexagon", + "@mace//:mace", + "@mace//:mace_headers", + "@mace//:mace_prod", + ], +) diff --git a/model_throughput_test.cc b/model_throughput_test.cc new file mode 100644 index 00000000..48e622cb --- /dev/null +++ b/model_throughput_test.cc @@ -0,0 +1,281 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +/** + * Usage: + * throughput_test \ + * --input_shape=1,224,224,3 \ + * --output_shape=1,224,224,2 \ + * --input_file=input_data \ + * --cpu_model_data_file=cpu_model_data.data \ + * --gpu_model_data_file=gpu_model_data.data \ + * --dsp_model_data_file=dsp_model_data.data \ + * --run_seconds=10 + */ +#include +#include +#include +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +using namespace std; +using namespace mace; + +namespace mace { + +#ifdef MACE_CPU_MODEL_TAG +namespace MACE_CPU_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_CPU_MODEL_TAG +#endif + +#ifdef MACE_GPU_MODEL_TAG +namespace MACE_GPU_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_GPU_MODEL_TAG +#endif + +#ifdef MACE_DSP_MODEL_TAG +namespace MACE_DSP_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_DSP_MODEL_TAG +#endif + +} // namespace mace + +void ParseShape(const string &str, vector *shape) { + string tmp = str; + while (!tmp.empty()) { + int dim = atoi(tmp.data()); + shape->push_back(dim); + size_t next_offset = tmp.find(","); + if (next_offset == string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +DeviceType ParseDeviceType(const string &device_str) { + if (device_str.compare("CPU") == 0) { + return DeviceType::CPU; + } else if (device_str.compare("NEON") == 0) { + return DeviceType::NEON; + } else if (device_str.compare("OPENCL") == 0) { + return DeviceType::OPENCL; + } else if (device_str.compare("HEXAGON") == 0) { + return DeviceType::HEXAGON; + } else { + return DeviceType::CPU; + } +} + +DEFINE_string(input_shape, "1,224,224,3", "input shape, separated by comma"); +DEFINE_string(output_shape, "1,224,224,2", "output shape, separated by comma"); +DEFINE_string(input_file, "", "input file name"); +DEFINE_string(cpu_model_data_file, "", "cpu model data file name"); +DEFINE_string(gpu_model_data_file, "", "gpu model data file name"); +DEFINE_string(dsp_model_data_file, "", "dsp model data file name"); +DEFINE_int32(run_seconds, 10, "run seconds"); + +int main(int argc, char **argv) { + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "mace version: " << MaceVersion(); + LOG(INFO) << "mace git version: " << MaceGitVersion(); +#ifdef MACE_CPU_MODEL_TAG + LOG(INFO) << "cpu model checksum: " << mace::MACE_CPU_MODEL_TAG::ModelChecksum(); +#endif +#ifdef MACE_GPU_MODEL_TAG + LOG(INFO) << "gpu model checksum: " << mace::MACE_GPU_MODEL_TAG::ModelChecksum(); +#endif +#ifdef MACE_DSP_MODEL_TAG + LOG(INFO) << "dsp model checksum: " << mace::MACE_DSP_MODEL_TAG::ModelChecksum(); +#endif + LOG(INFO) << "input_shape: " << FLAGS_input_shape; + LOG(INFO) << "output_shape: " << FLAGS_output_shape; + LOG(INFO) << "input_file: " << FLAGS_input_file; + LOG(INFO) << "cpu_model_data_file: " << FLAGS_cpu_model_data_file; + LOG(INFO) << "gpu_model_data_file: " << FLAGS_gpu_model_data_file; + LOG(INFO) << "dsp_model_data_file: " << FLAGS_dsp_model_data_file; + LOG(INFO) << "run_seconds: " << FLAGS_run_seconds; + + vector input_shape_vec; + vector output_shape_vec; + ParseShape(FLAGS_input_shape, &input_shape_vec); + ParseShape(FLAGS_output_shape, &output_shape_vec); + + int64_t input_size = + std::accumulate(input_shape_vec.begin(), input_shape_vec.end(), 1, + std::multiplies()); + int64_t output_size = + std::accumulate(output_shape_vec.begin(), output_shape_vec.end(), 1, + std::multiplies()); + std::unique_ptr input_data(new float[input_size]); + std::unique_ptr cpu_output_data(new float[output_size]); + std::unique_ptr gpu_output_data(new float[output_size]); + std::unique_ptr dsp_output_data(new float[output_size]); + + // load input + ifstream in_file(FLAGS_input_file, ios::in | ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_data.get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + + int64_t t0, t1, init_micros; +#ifdef MACE_CPU_MODEL_TAG + /* --------------------- CPU init ----------------------- */ + LOG(INFO) << "Load & init cpu model and warm up"; + const unsigned char *cpu_model_data = + mace::MACE_CPU_MODEL_TAG::LoadModelData(FLAGS_cpu_model_data_file.c_str()); + NetDef cpu_net_def = mace::MACE_CPU_MODEL_TAG::CreateNet(cpu_model_data); + + mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU); + + LOG(INFO) << "CPU Warm up run"; + t0 = NowMicros(); + cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get()); + t1 = NowMicros(); + LOG(INFO) << "CPU 1st warm up run latency: " << t1 - t0 << " us"; +#endif + +#ifdef MACE_GPU_MODEL_TAG + /* --------------------- GPU init ----------------------- */ + LOG(INFO) << "Load & init gpu model and warm up"; + const unsigned char *gpu_model_data = + mace::MACE_GPU_MODEL_TAG::LoadModelData(FLAGS_gpu_model_data_file.c_str()); + NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data); + + mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL); + mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data); + + LOG(INFO) << "GPU Warm up run"; + t0 = NowMicros(); + gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get()); + t1 = NowMicros(); + LOG(INFO) << "GPU 1st warm up run latency: " << t1 - t0 << " us"; +#endif + +#ifdef MACE_DSP_MODEL_TAG + /* --------------------- DSP init ----------------------- */ + LOG(INFO) << "Load & init dsp model and warm up"; + const unsigned char *dsp_model_data = + mace::MACE_DSP_MODEL_TAG::LoadModelData(FLAGS_gpu_model_data_file.c_str()); + NetDef dsp_net_def = mace::MACE_DSP_MODEL_TAG::CreateNet(dsp_model_data); + + mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON); + mace::MACE_DSP_MODEL_TAG::UnloadModelData(dsp_model_data); + + LOG(INFO) << "DSP Warm up run"; + t0 = NowMicros(); + gpu_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get()); + t1 = NowMicros(); + LOG(INFO) << "DSP 1st warm up run latency: " << t1 - t0 << " us"; +#endif + + double cpu_throughput = 0; + double gpu_throughput = 0; + double dsp_throughput = 0; + int64_t run_micros = FLAGS_run_seconds * 1000000; + +#ifdef MACE_CPU_MODEL_TAG + std::thread cpu_thread([&]() { + int64_t frames = 0; + int64_t micros = 0; + int64_t start = NowMicros(); + for (; micros < run_micros; ++frames) { + cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get()); + int64_t end = NowMicros(); + micros = end - start; + } + cpu_throughput = frames * 1000000.0 / micros; + }); +#endif + +#ifdef MACE_GPU_MODEL_TAG + std::thread gpu_thread([&]() { + int64_t frames = 0; + int64_t micros = 0; + int64_t start = NowMicros(); + for (; micros < run_micros; ++frames) { + gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get()); + int64_t end = NowMicros(); + micros = end - start; + } + gpu_throughput = frames * 1000000.0 / micros; + }); +#endif + +#ifdef MACE_DSP_MODEL_TAG + std::thread dsp_thread([&]() { + int64_t frames = 0; + int64_t micros = 0; + int64_t start = NowMicros(); + for (; micros < run_micros; ++frames) { + dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get()); + int64_t end = NowMicros(); + micros = end - start; + } + dsp_throughput = frames * 1000000.0 / micros; + }); +#endif + + double total_throughput = 0; + +#ifdef MACE_CPU_MODEL_TAG + cpu_thread.join(); + LOG(INFO) << "CPU throughput: " << cpu_throughput << " f/s"; + total_throughput += cpu_throughput; +#endif +#ifdef MACE_GPU_MODEL_TAG + gpu_thread.join(); + LOG(INFO) << "GPU throughput: " << gpu_throughput << " f/s"; + total_throughput += gpu_throughput; +#endif +#ifdef MACE_DSP_MODEL_TAG + dsp_thread.join(); + LOG(INFO) << "DSP throughput: " << dsp_throughput << " f/s"; + total_throughput += dsp_throughput; +#endif + + LOG(INFO) << "Total throughput: " << total_throughput << " f/s"; +} -- GitLab