diff --git a/mace/examples/BUILD b/mace/examples/BUILD index ff47e1d98fcca18a96cee0c9a69bd5a7101554db..af806dcc06688984b7df812f1fe9dc9e77ed93d3 100644 --- a/mace/examples/BUILD +++ b/mace/examples/BUILD @@ -24,3 +24,14 @@ cc_test( "//mace/core:test_benchmark_main", ], ) + +cc_binary( + name = "mace_run", + srcs = ["mace_run.cc"], + linkopts = if_openmp_enabled(["-fopenmp"]), + linkstatic = 1, + deps = [ + "//mace/codegen:generated_models", + "//external:gflags_nothreads", + ], +) diff --git a/mace/examples/mace_run.cc b/mace/examples/mace_run.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c188c274b8fddb3fddb9c6007d2245a3cdebd06 --- /dev/null +++ b/mace/examples/mace_run.cc @@ -0,0 +1,407 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +/** + * Usage: + * mace_run --model=mobi_mace.pb \ + * --input=input_node \ + * --output=output_node \ + * --input_shape=1,224,224,3 \ + * --output_shape=1,224,224,2 \ + * --input_file=input_data \ + * --output_file=mace.out \ + * --model_data_file=model_data.data \ + * --device=OPENCL + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +using namespace std; +using namespace mace; + +namespace mace { +namespace MACE_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_MODEL_TAG +} // namespace mace + + +namespace str_util { + +std::vector Split(const std::string &str, char delims) { + std::vector result; + std::string tmp = str; + while (!tmp.empty()) { + size_t next_offset = tmp.find(delims); + result.push_back(tmp.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } + return result; +} + +} // namespace str_util + +void ParseShape(const string &str, vector *shape) { + string tmp = str; + while (!tmp.empty()) { + int dim = atoi(tmp.data()); + shape->push_back(dim); + size_t next_offset = tmp.find(","); + if (next_offset == string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +std::string FormatName(const std::string input) { + std::string res = input; + for (size_t i = 0; i < input.size(); ++i) { + if (!isalnum(res[i])) res[i] = '_'; + } + return res; +} + +DeviceType ParseDeviceType(const string &device_str) { + if (device_str.compare("CPU") == 0) { + return DeviceType::CPU; + } else if (device_str.compare("NEON") == 0) { + return DeviceType::NEON; + } else if (device_str.compare("OPENCL") == 0) { + return DeviceType::OPENCL; + } else if (device_str.compare("HEXAGON") == 0) { + return DeviceType::HEXAGON; + } else { + return DeviceType::CPU; + } +} + +struct mallinfo LogMallinfoChange(struct mallinfo prev) { + struct mallinfo curr = mallinfo(); + if (prev.arena != curr.arena) { + LOG(INFO) << "Non-mmapped space allocated (bytes): " << curr.arena + << ", diff: " << ((int64_t)curr.arena - (int64_t)prev.arena); + } + if (prev.ordblks != curr.ordblks) { + LOG(INFO) << "Number of free chunks: " << curr.ordblks + << ", diff: " << ((int64_t)curr.ordblks - (int64_t)prev.ordblks); + } + if (prev.smblks != curr.smblks) { + LOG(INFO) << "Number of free fastbin blocks: " << curr.smblks + << ", diff: " << ((int64_t)curr.smblks - (int64_t)prev.smblks); + } + if (prev.hblks != curr.hblks) { + LOG(INFO) << "Number of mmapped regions: " << curr.hblks + << ", diff: " << ((int64_t)curr.hblks - (int64_t)prev.hblks); + } + if (prev.hblkhd != curr.hblkhd) { + LOG(INFO) << "Space allocated in mmapped regions (bytes): " << curr.hblkhd + << ", diff: " << ((int64_t)curr.hblkhd - (int64_t)prev.hblkhd); + } + if (prev.usmblks != curr.usmblks) { + LOG(INFO) << "Maximum total allocated space (bytes): " << curr.usmblks + << ", diff: " << ((int64_t)curr.usmblks - (int64_t)prev.usmblks); + } + if (prev.fsmblks != curr.fsmblks) { + LOG(INFO) << "Space in freed fastbin blocks (bytes): " << curr.fsmblks + << ", diff: " << ((int64_t)curr.fsmblks - (int64_t)prev.fsmblks); + } + if (prev.uordblks != curr.uordblks) { + LOG(INFO) << "Total allocated space (bytes): " << curr.uordblks + << ", diff: " + << ((int64_t)curr.uordblks - (int64_t)prev.uordblks); + } + if (prev.fordblks != curr.fordblks) { + LOG(INFO) << "Total free space (bytes): " << curr.fordblks << ", diff: " + << ((int64_t)curr.fordblks - (int64_t)prev.fordblks); + } + if (prev.keepcost != curr.keepcost) { + LOG(INFO) << "Top-most, releasable space (bytes): " << curr.keepcost + << ", diff: " + << ((int64_t)curr.keepcost - (int64_t)prev.keepcost); + } + return curr; +} + +DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma"); +DEFINE_string(input_shape, "1,224,224,3:1,1,1,10", "input shapes, separated by colon and comma"); +DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma"); +DEFINE_string(output_shape, "1,224,224,2:1,1,1,10", "output shapes, separated by colon and comma"); +DEFINE_string(input_file, "", "input file name | input file prefix for multiple inputs."); +DEFINE_string(output_file, "", "output file name | output file prefix for multiple outputs"); +DEFINE_string(model_data_file, "", + "model data file name, used when EMBED_MODEL_DATA set to 0"); +DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); +DEFINE_int32(round, 1, "round"); +DEFINE_int32(restart_round, 1, "restart round"); +DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); + +bool SingleInputAndOutput(const std::vector &input_shape, + const std::vector &output_shape) { + // load model + int64_t t0 = NowMicros(); + const unsigned char *model_data = + mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); + NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); + int64_t t1 = NowMicros(); + LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us"; + int64_t init_micros = t1 - t0; + + DeviceType device_type = ParseDeviceType(FLAGS_device); + LOG(INFO) << "Runing with device type: " << device_type; + + // Init model + LOG(INFO) << "Run init"; + t0 = NowMicros(); + mace::MaceEngine engine(&net_def, device_type); + if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { + mace::MACE_MODEL_TAG::UnloadModelData(model_data); + } + t1 = NowMicros(); + init_micros += t1 - t0; + LOG(INFO) << "Net init latency: " << t1 - t0 << " us"; + LOG(INFO) << "Total init latency: " << init_micros << " us"; + + // Allocate input and output + int64_t input_size = + std::accumulate(input_shape.begin(), input_shape.end(), 1, + std::multiplies()); + int64_t output_size = + std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + std::unique_ptr input_data(new float[input_size]); + std::unique_ptr output_data(new float[output_size]); + + // load input + ifstream in_file(FLAGS_input_file + "_" + FormatName(FLAGS_input_node), ios::in | ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_data.get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + + LOG(INFO) << "Warm up run"; + t0 = NowMicros(); + engine.Run(input_data.get(), input_shape, output_data.get()); + t1 = NowMicros(); + LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us"; + + if (FLAGS_round > 0) { + LOG(INFO) << "Run model"; + t0 = NowMicros(); + struct mallinfo prev = mallinfo(); + for (int i = 0; i < FLAGS_round; ++i) { + engine.Run(input_data.get(), input_shape, output_data.get()); + if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) { + LOG(INFO) << "=== check malloc info change #" << i << " ==="; + prev = LogMallinfoChange(prev); + } + } + t1 = NowMicros(); + LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us"; + } + + if (FLAGS_restart_round == 1) { + if (output_data != nullptr) { + std::string + output_name = FLAGS_output_file + "_" + FormatName(FLAGS_output_node); + ofstream out_file(output_name, ios::binary); + out_file.write((const char *) (output_data.get()), + output_size * sizeof(float)); + out_file.flush(); + out_file.close(); + LOG(INFO) << "Write output file " + << output_name + << " with size " << output_size + << " done."; + } else { + LOG(INFO) << "Output data is null"; + } + } + + return true; +} + +bool MultipleInputOrOutput(const std::vector &input_names, + const std::vector> &input_shapes, + const std::vector &output_names, + const std::vector> &output_shapes) { + // load model + int64_t t0 = NowMicros(); + const unsigned char *model_data = + mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); + NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); + int64_t t1 = NowMicros(); + LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us"; + int64_t init_micros = t1 - t0; + + DeviceType device_type = ParseDeviceType(FLAGS_device); + LOG(INFO) << "Runing with device type: " << device_type; + + // Init model + LOG(INFO) << "Run init"; + t0 = NowMicros(); + mace::MaceEngine engine(&net_def, device_type, input_names, output_names); + if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { + mace::MACE_MODEL_TAG::UnloadModelData(model_data); + } + t1 = NowMicros(); + init_micros += t1 - t0; + LOG(INFO) << "Net init latency: " << t1 - t0 << " us"; + LOG(INFO) << "Total init latency: " << init_micros << " us"; + + const size_t input_count = input_names.size(); + const size_t output_count = output_names.size(); + std::vector input_infos(input_count); + std::map outputs; + std::vector> input_datas(input_count); + for (size_t i = 0; i < input_count; ++i) { + // Allocate input and output + int64_t input_size = + std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1, + std::multiplies()); + input_datas[i].reset(new float[input_size]); + // load input + ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), ios::in | ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_datas[i].get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + input_infos[i].name = input_names[i]; + input_infos[i].shape = input_shapes[i]; + input_infos[i].data = input_datas[i].get(); + } + std::vector> output_datas(output_count); + for (size_t i = 0; i < output_count; ++i) { + int64_t output_size = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::multiplies()); + output_datas[i].reset(new float[output_size]); + outputs[output_names[i]] = output_datas[i].get(); + } + + LOG(INFO) << "Warm up run"; + t0 = NowMicros(); + engine.Run(input_infos, outputs); + t1 = NowMicros(); + LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us"; + + if (FLAGS_round > 0) { + LOG(INFO) << "Run model"; + t0 = NowMicros(); + struct mallinfo prev = mallinfo(); + for (int i = 0; i < FLAGS_round; ++i) { + engine.Run(input_infos, outputs); + if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) { + LOG(INFO) << "=== check malloc info change #" << i << " ==="; + prev = LogMallinfoChange(prev); + } + } + t1 = NowMicros(); + LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us"; + } + + for (size_t i = 0; i < output_count; ++i) { + std::string output_name = FLAGS_output_file + "_" + FormatName(output_names[i]); + ofstream out_file(output_name, ios::binary); + int64_t output_size = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::multiplies()); + out_file.write((const char *) outputs[output_names[i]], + output_size * sizeof(float)); + out_file.flush(); + out_file.close(); + LOG(INFO) << "Write output file " + << output_name + << " with size " << output_size + << " done."; + } + + return true; +} + +int main(int argc, char **argv) { + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "mace version: " << MaceVersion(); + LOG(INFO) << "mace git version: " << MaceGitVersion(); + LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum(); + LOG(INFO) << "input node: " << FLAGS_input_node; + LOG(INFO) << "input shape: " << FLAGS_input_shape; + LOG(INFO) << "output node: " << FLAGS_output_node; + LOG(INFO) << "output shape: " << FLAGS_output_shape; + LOG(INFO) << "input_file: " << FLAGS_input_file; + LOG(INFO) << "output_file: " << FLAGS_output_file; + LOG(INFO) << "model_data_file: " << FLAGS_model_data_file; + LOG(INFO) << "device: " << FLAGS_device; + LOG(INFO) << "round: " << FLAGS_restart_round; + LOG(INFO) << "restart_round: " << FLAGS_round; + + std::vector input_names = str_util::Split(FLAGS_input_node, ','); + std::vector output_names = str_util::Split(FLAGS_output_node, ','); + std::vector input_shapes = str_util::Split(FLAGS_input_shape, ':'); + std::vector output_shapes = str_util::Split(FLAGS_output_shape, ':'); + + const size_t input_count = input_shapes.size(); + const size_t output_count = output_shapes.size(); + std::vector> input_shape_vec(input_count); + std::vector> output_shape_vec(output_count); + for (size_t i = 0; i < input_count; ++i) { + ParseShape(input_shapes[i], &input_shape_vec[i]); + } + for (size_t i = 0; i < output_count; ++i) { + ParseShape(output_shapes[i], &output_shape_vec[i]); + } + + bool ret; +#pragma omp parallel for + for (int i = 0; i < FLAGS_restart_round; ++i) { + VLOG(0) << "restart round " << i; + if (input_count == 1 && output_count == 1) { + ret = SingleInputAndOutput(input_shape_vec[0], output_shape_vec[0]); + } else { + ret = MultipleInputOrOutput(input_names, + input_shape_vec, + output_names, + output_shape_vec); + } + } + if(ret) { + return 0; + } else { + return -1; + } +}