diff --git a/WORKSPACE b/WORKSPACE index fe00c16fecc68cd54f01465f0a8a3bb57fba6278..af04846f4460f86cf64907ce658911f2ded3fdd9 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -56,3 +56,20 @@ android_ndk_repository( # Android 5.0 api_level = 21 ) + +git_repository( + name = "com_github_gflags_gflags", + #tag = "v2.2.0", + commit = "30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e", # v2.2.0 + fix of include path + remote = "https://github.com/gflags/gflags.git" +) + +bind( + name = "gflags", + actual = "@com_github_gflags_gflags//:gflags", +) + +bind( + name = "gflags_nothreads", + actual = "@com_github_gflags_gflags//:gflags_nothreads", +) diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..498d1ea006d692ac10ec211fb9e99c6b25b4986d --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,125 @@ +FROM ubuntu:16.04 + +# Update source +# Looks like mirrors.163.com does not work in xiaomi network +# RUN sed -i 's/http:\/\/archive\.ubuntu\.com\/ubuntu\//http:\/\/mirrors\.163\.com\/ubuntu\//g' /etc/apt/sources.list +RUN apt-get update -y + +## Basic tools +RUN apt-get install -y --no-install-recommends apt-utils +RUN apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + curl \ + git \ + libcurl3-dev \ + libgoogle-glog-dev \ + libfreetype6-dev \ + libpng12-dev \ + libprotobuf-dev \ + libzmq3-dev \ + pkg-config \ + python-dev \ + python-pip \ + protobuf-compiler \ + rsync \ + software-properties-common \ + unzip \ + zip \ + zlib1g-dev \ + openjdk-8-jdk \ + openjdk-8-jre-headless \ + openssh-server \ + wget +RUN pip install --upgrade pip + +ENV ANDROID_NDK_HOME /opt/android-ndk +ENV ANDROID_NDK /opt/android-ndk +ENV ANDROID_NDK_VERSION r15c + +# Android NDK +# download +RUN mkdir /opt/android-ndk-tmp && \ + cd /opt/android-ndk-tmp && \ + wget -q https://dl.google.com/android/repository/android-ndk-${ANDROID_NDK_VERSION}-linux-x86_64.zip && \ +# uncompress + unzip -q 
android-ndk-${ANDROID_NDK_VERSION}-linux-x86_64.zip && \ +# move to its final location + mv ./android-ndk-${ANDROID_NDK_VERSION} ${ANDROID_NDK_HOME} && \ +# remove temp dir + cd ${ANDROID_NDK_HOME} && \ + rm -rf /opt/android-ndk-tmp + +# add to PATH +ENV PATH ${PATH}:${ANDROID_NDK_HOME} + +# Set up Bazel. +# Running bazel inside a `docker build` command causes trouble, cf: +# https://github.com/bazelbuild/bazel/issues/134 +# The easiest solution is to set up a bazelrc file forcing --batch. +RUN echo "startup --batch" >>/etc/bazel.bazelrc +# Similarly, we need to workaround sandboxing issues: +# https://github.com/bazelbuild/bazel/issues/418 +RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ + >>/etc/bazel.bazelrc +# Install the most recent bazel release. +ENV BAZEL_VERSION 0.7.0 +WORKDIR / +RUN mkdir /bazel && \ + cd /bazel && \ + wget https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \ + chmod +x bazel-*.sh && \ + ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \ + cd / && \ + rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh + +# Setup vim +RUN apt-get update -y +RUN apt-get install -y --no-install-recommends \ + locales \ + vim + +RUN mkdir -p ~/.vim/autoload ~/.vim/bundle && \ + curl -LSso ~/.vim/autoload/pathogen.vim https://tpo.pe/pathogen.vim + +RUN mkdir -p ~/.vim/bundle +RUN cd ~/.vim/bundle && \ + git clone https://github.com/scrooloose/nerdtree.git && \ + git clone https://github.com/vim-syntastic/syntastic.git && \ + git clone https://github.com/vim-airline/vim-airline.git && \ + git clone https://github.com/altercation/vim-colors-solarized.git && \ + git clone https://github.com/bazelbuild/vim-ft-bzl.git && \ + git clone https://github.com/google/vim-maktaba.git && \ + git clone https://github.com/google/vim-codefmt.git + +RUN curl -LSso ~/.vimrc 
https://gist.githubusercontent.com/llhe/71c5802919debd5825c100c0135478a7/raw/16a35020cc65f9c25d0cf8f11a3ba7b345a1271d/.vimrc
+
+RUN locale-gen en_US.UTF-8
+ENV LC_CTYPE=en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+ENV TERM xterm-256color
+
+# Extra packages (debug/dev conveniences for an interactive dev image)
+RUN apt-get install -y --no-install-recommends \
+    telnet \
+    net-tools \
+    inetutils-ping \
+    screen \
+    android-tools-adb
+
+# Install Python tools. NOTE(review): plain-http mirror — packages are not
+# transport-verified; consider an https index.
+RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com setuptools
+RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.4.0 \
+    scipy \
+    jinja2 \
+    pyyaml
+
+# Download tensorflow tools
+RUN wget http://cnbj1-inner-fds.api.xiaomi.net/mace/tool/transform_graph && \
+    chmod +x transform_graph
+
+# Install gitlab runner. NOTE(review): curl | bash of an unpinned script.
+RUN curl -L https://packages.gitlab.com/install/repositories/runner/gitlab-ci-multi-runner/script.deb.sh | bash
+RUN apt-get install -y gitlab-ci-multi-runner
+
+ENTRYPOINT ["gitlab-runner", "run"]
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d5c919e419baf10ea961c29cc6bd422a487922cb
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,27 @@
+# 包含mace环境的docker镜像
+========
+
+* Login in [小米容器仓库](http://docs.api.xiaomi.net/docker-registry/)
+
+  ```
+  docker login cr.d.xiaomi.net
+  ```
+
+* 使用`Dockerfile`编译镜像
+
+  ```
+  docker build -t cr.d.xiaomi.net/mace/mace-dev .
+  ```
+
+* 或者从镜像仓库直接pull镜像
+
+  ```
+  docker pull cr.d.xiaomi.net/mace/mace-dev
+  ```
+
+* 启动容器
+
+  ```
+  # Set 'host' network to use ADB
+  docker run -it --rm -v /local/path:/container/path --net=host cr.d.xiaomi.net/mace/mace-dev /bin/bash
+  ```
diff --git a/docker/caffe/Dockerfile b/docker/caffe/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..4bf9e94b7ef550ca7230ce87579a1ccd7af18409
--- /dev/null
+++ b/docker/caffe/Dockerfile
@@ -0,0 +1,46 @@
+FROM ubuntu:16.04
+LABEL maintainer caffe-maint@googlegroups.com
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    cmake \
+    git \
+    wget \
+    libatlas-base-dev \
+    libboost-all-dev \
+    libgflags-dev \
+    libgoogle-glog-dev \
+    libhdf5-serial-dev \
+    libleveldb-dev \
+    liblmdb-dev \
+    libopencv-dev \
+    libprotobuf-dev \
+    libsnappy-dev \
+    protobuf-compiler \
+    python-dev \
+    python-numpy \
+    python-pip \
+    python-setuptools \
+    python-scipy && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV CAFFE_ROOT=/opt/caffe
+WORKDIR $CAFFE_ROOT
+
+# FIXME: use ARG instead of ENV once DockerHub supports this
+# https://github.com/docker/hub-feedback/issues/460
+ENV CLONE_TAG=1.0
+
+RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \
+    pip install --upgrade pip && \
+    cd python && for req in $(cat requirements.txt) pydot; do pip install $req; done && cd .. && \
+    mkdir build && cd build && \
+    cmake -DCPU_ONLY=1 ..
&& \ + make -j"$(nproc)" + +ENV PYCAFFE_ROOT $CAFFE_ROOT/python +ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH +ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH +RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig + +WORKDIR /mace diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..457922879b8b4c630c70ed852e8a4e3138bd6793 --- /dev/null +++ b/mace/benchmark/BUILD @@ -0,0 +1,66 @@ +# Benchmark +# Examples +load( + "//:mace.bzl", + "if_production_mode", + "if_not_production_mode", + "if_hexagon_enabled", + "if_openmp_enabled", +) + +licenses(["notice"]) # Apache 2.0 + +cc_library( + name = "stat_summarizer", + srcs = ["stat_summarizer.cc"], + hdrs = ["stat_summarizer.h"], + linkstatic = 1, + deps = [ + "@mace//:mace_headers", + ], +) + +cc_binary( + name = "benchmark_model", + srcs = [ + "benchmark_model.cc", + ], + linkopts = if_openmp_enabled(["-fopenmp"]), + linkstatic = 1, + deps = [ + ":stat_summarizer", + "//mace/codegen:generated_models", + "//external:gflags_nothreads", + ] + if_hexagon_enabled([ + "//lib/hexagon:hexagon", + ]) + if_production_mode([ + "@mace//:mace_prod", + "//codegen:generated_opencl_prod", + "//codegen:generated_tuning_params", + ]) + if_not_production_mode([ + "@mace//:mace_dev", + ]), +) + +cc_library( + name = "libmace_merged", + srcs = [ + "libmace_merged.a", + ], + visibility = ["//visibility:private"], +) + +cc_binary( + name = "model_throughput_test", + srcs = ["model_throughput_test.cc"], + linkopts = if_openmp_enabled(["-fopenmp"]), + linkstatic = 1, + deps = [ + ":libmace_merged", + "//external:gflags_nothreads", + "//lib/hexagon", + "@mace//:mace", + "@mace//:mace_headers", + "@mace//:mace_prod", + ], +) diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc new file mode 100644 index 0000000000000000000000000000000000000000..26bfeb618bdeff76c56e9ceb2cb33930a43dc461 --- /dev/null +++ 
b/mace/benchmark/benchmark_model.cc @@ -0,0 +1,362 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "gflags/gflags.h" +#include "mace/public/mace.h" +#include "mace/utils/logging.h" +#include "benchmark/stat_summarizer.h" + +#include +#include +#include +#include +#include + +namespace mace { +namespace MACE_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} +} + +namespace mace { +namespace str_util { + +std::vector Split(const std::string &str, char delims) { + std::vector result; + std::string tmp = str; + while (!tmp.empty()) { + size_t next_offset = tmp.find(delims); + result.push_back(tmp.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } + return result; +} + +bool SplitAndParseToInts(const std::string &str, + char delims, + std::vector *result) { + std::string tmp = str; + while (!tmp.empty()) { + int64_t dim = atoi(tmp.data()); + result->push_back(dim); + size_t next_offset = tmp.find(delims); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } + return true; +} + +} // namespace str_util + +namespace benchmark { + +void ParseShape(const std::string &str, std::vector *shape) { + std::string tmp = str; + while (!tmp.empty()) { + int dim = atoi(tmp.data()); + shape->push_back(dim); + size_t next_offset = tmp.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +std::string FormatName(const std::string input) { + std::string res = input; + for (size_t i = 0; i < input.size(); ++i) { + if (!::isalnum(res[i])) res[i] = '_'; + } + return res; +} + +inline int64_t NowMicros() { + struct timeval tv; + gettimeofday(&tv, nullptr); 
+ return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +} + +bool RunInference(MaceEngine *engine, + const std::vector &input_infos, + std::map &output_infos, + StatSummarizer *summarizer, + int64_t *inference_time_us) { + RunMetadata run_metadata; + RunMetadata *run_metadata_ptr = nullptr; + if (summarizer) { + run_metadata_ptr = &run_metadata; + } + if (input_infos.size() == 1 && output_infos.size() == 1) { + const int64_t start_time = NowMicros(); + bool s = engine->Run(input_infos[0].data, input_infos[0].shape, + output_infos.begin()->second, run_metadata_ptr); + const int64_t end_time = NowMicros(); + + if (!s) { + LOG(ERROR) << "Error during inference."; + return s; + } + *inference_time_us = end_time - start_time; + } else { + const int64_t start_time = NowMicros(); + bool s = engine->Run(input_infos, output_infos, run_metadata_ptr); + const int64_t end_time = NowMicros(); + + if (!s) { + LOG(ERROR) << "Error during inference."; + return s; + } + *inference_time_us = end_time - start_time; + } + + if (summarizer != nullptr) { + summarizer->ProcessMetadata(run_metadata); + } + + return true; +} + +bool Run(MaceEngine *engine, + const std::vector &input_infos, + std::map &output_infos, + StatSummarizer *summarizer, + int num_runs, + double max_time_sec, + int64_t sleep_sec, + int64_t *total_time_us, + int64_t *actual_num_runs) { + *total_time_us = 0; + + LOG(INFO) << "Running benchmark for max " << num_runs << " iterators, max " + << max_time_sec << " seconds " + << (summarizer != nullptr ? 
"with " : "without ") + << "detailed stat logging, with " << sleep_sec + << "s sleep between inferences"; + + Stat stat; + + bool util_max_time = (num_runs <= 0); + for (int i = 0; util_max_time || i < num_runs; ++i) { + int64_t inference_time_us = 0; + bool s = RunInference(engine, input_infos, output_infos, summarizer, &inference_time_us); + stat.UpdateStat(inference_time_us); + (*total_time_us) += inference_time_us; + ++(*actual_num_runs); + + if (max_time_sec > 0 && (*total_time_us / 1000000.0) > max_time_sec) { + break; + } + + if (!s) { + LOG(INFO) << "Failed on run " << i; + return s; + } + + if (sleep_sec > 0) { + std::this_thread::sleep_for(std::chrono::seconds(sleep_sec)); + } + } + + std::stringstream stream; + stat.OutputToStream(&stream); + LOG(INFO) << stream.str(); + + return true; +} + +DEFINE_string(device, "CPU", "Device [CPU|OPENCL]"); +DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma"); +DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma"); +DEFINE_string(input_shape, "", "input shape, separated by colon and comma"); +DEFINE_string(output_shape, "", "output shape, separated by colon and comma"); +DEFINE_string(input_file, "", "input file name"); +DEFINE_int32(max_num_runs, 100, "number of runs max"); +DEFINE_string(max_time, "10.0", "length to run max"); +DEFINE_string(inference_delay, "-1", "delay between runs in seconds"); +DEFINE_string(inter_benchmark_delay, "-1", "delay between benchmarks in seconds"); +DEFINE_string(benchmark_name, "", "benchmark name"); +DEFINE_bool(show_run_order, true, "whether to list stats by run order"); +DEFINE_int32(run_order_limit, 0, "how many items to show by run order"); +DEFINE_bool(show_time, true, "whether to list stats by time taken"); +DEFINE_int32(time_limit, 10, "how many items to show by time taken"); +DEFINE_bool(show_memory, false, "whether to list stats by memory used"); +DEFINE_int32(memory_limit, 10, "how many items to show 
by memory used"); +DEFINE_bool(show_type, true, "whether to list stats by op type"); +DEFINE_bool(show_summary, true, "whether to show a summary of the stats"); +DEFINE_bool(show_flops, true, "whether to estimate the model's FLOPs"); +DEFINE_int32(warmup_runs, 1, "how many runs to initialize model"); +DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0"); + +int Main(int argc, char **argv) { + MACE_CHECK(FLAGS_device != "HEXAGON", "Model benchmark tool do not support DSP."); + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "Benchmark name: [" << FLAGS_benchmark_name << "]"; + LOG(INFO) << "Device: [" << FLAGS_device << "]"; + LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]"; + LOG(INFO) << "Input shapes: [" << FLAGS_input_shape << "]"; + LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]"; + LOG(INFO) << "output shapes: [" << FLAGS_output_shape << "]"; + LOG(INFO) << "Warmup runs: [" << FLAGS_warmup_runs << "]"; + LOG(INFO) << "Num runs: [" << FLAGS_max_num_runs << "]"; + LOG(INFO) << "Inter-inference delay (seconds): [" << FLAGS_inference_delay << "]"; + LOG(INFO) << "Inter-benchmark delay (seconds): [" << FLAGS_inter_benchmark_delay << "]"; + + const long int inter_inference_sleep_seconds = + std::strtol(FLAGS_inference_delay.c_str(), nullptr, 10); + const long int inter_benchmark_sleep_seconds = + std::strtol(FLAGS_inter_benchmark_delay.c_str(), nullptr, 10); + const double max_benchmark_time_seconds = + std::strtod(FLAGS_max_time.c_str(), nullptr); + + std::unique_ptr stats; + + StatSummarizerOptions stats_options; + stats_options.show_run_order = FLAGS_show_run_order; + stats_options.run_order_limit = FLAGS_run_order_limit; + stats_options.show_time = FLAGS_show_time; + stats_options.time_limit = FLAGS_time_limit; + stats_options.show_memory = FLAGS_show_memory; + stats_options.memory_limit = FLAGS_memory_limit; + stats_options.show_type = 
FLAGS_show_type; + stats_options.show_summary = FLAGS_show_summary; + stats.reset(new StatSummarizer(stats_options)); + + DeviceType device_type = CPU; + if(FLAGS_device == "OPENCL") { + device_type = OPENCL; + } + + std::vector input_names = str_util::Split(FLAGS_input_node, ','); + std::vector output_names = str_util::Split(FLAGS_output_node, ','); + std::vector input_shapes = str_util::Split(FLAGS_input_shape, ':'); + std::vector output_shapes = str_util::Split(FLAGS_output_shape, ':'); + + const size_t input_count = input_shapes.size(); + const size_t output_count = output_shapes.size(); + std::vector> input_shape_vec(input_count); + std::vector> output_shape_vec(output_count); + for (size_t i = 0; i < input_count; ++i) { + ParseShape(input_shapes[i], &input_shape_vec[i]); + } + for (size_t i = 0; i < output_count; ++i) { + ParseShape(output_shapes[i], &output_shape_vec[i]); + } + + const unsigned char *model_data = + mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); + NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); + + std::vector input_infos(input_count); + std::map output_infos; + std::vector> input_datas(input_count); + std::vector> output_datas(output_count); + + for (size_t i = 0; i < input_count; ++i) { + int64_t input_size = std::accumulate(input_shape_vec[i].begin(), + input_shape_vec[i].end(), 1, std::multiplies()); + input_datas[i].reset(new float[input_size]); + // load input + std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), std::ios::in | std::ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_datas[i].get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + + input_infos[i].name = input_names[i]; + input_infos[i].shape = input_shape_vec[i]; + input_infos[i].data = input_datas[i].get(); + } + for (size_t i = 0; i < output_count; ++i) { + int64_t output_size = 
std::accumulate(output_shape_vec[i].begin(), + output_shape_vec[i].end(), 1, std::multiplies()); + output_datas[i].reset(new float[output_size]); + output_infos[output_names[i]] = output_datas[i].get(); + } + + // Init model + LOG(INFO) << "Run init"; + std::unique_ptr engine_ptr; + if (input_count == 1 && output_count == 1) { + engine_ptr.reset(new mace::MaceEngine(&net_def, device_type)); + } else { + engine_ptr.reset(new mace::MaceEngine(&net_def, device_type, input_names, output_names)); + } + if (device_type == DeviceType::OPENCL) { + mace::MACE_MODEL_TAG::UnloadModelData(model_data); + } + + LOG(INFO) << "Warm up"; + + int64_t warmup_time_us = 0; + int64_t num_warmup_runs = 0; + if (FLAGS_warmup_runs > 0) { + bool status = + Run(engine_ptr.get(), input_infos, output_infos, nullptr, FLAGS_warmup_runs, -1.0, + inter_inference_sleep_seconds, &warmup_time_us, &num_warmup_runs); + if (!status) { + LOG(ERROR) << "Failed at warm up run"; + } + } + + if (inter_benchmark_sleep_seconds > 0) { + std::this_thread::sleep_for( + std::chrono::seconds(inter_benchmark_sleep_seconds)); + } + int64_t no_stat_time_us = 0; + int64_t no_stat_runs = 0; + bool status = + Run(engine_ptr.get(), input_infos, output_infos, + nullptr, FLAGS_max_num_runs, max_benchmark_time_seconds, + inter_inference_sleep_seconds, &no_stat_time_us, &no_stat_runs); + if (!status) { + LOG(ERROR) << "Failed at normal no-stat run"; + } + + int64_t stat_time_us = 0; + int64_t stat_runs = 0; + status = Run(engine_ptr.get(), input_infos, output_infos, + stats.get(), FLAGS_max_num_runs, max_benchmark_time_seconds, + inter_inference_sleep_seconds, &stat_time_us, &stat_runs); + if (!status) { + LOG(ERROR) << "Failed at normal stat run"; + } + + LOG(INFO) << "Average inference timings in us: " + << "Warmup: " + << (FLAGS_warmup_runs > 0 ? 
warmup_time_us / FLAGS_warmup_runs : 0) << ", " + << "no stats: " << no_stat_time_us / no_stat_runs << ", " + << "with stats: " << stat_time_us / stat_runs; + + stats->PrintOperatorStats(); + + return 0; +} + +} // namespace benchmark +} // namespace mace + +int main(int argc, char **argv) { mace::benchmark::Main(argc, argv); } diff --git a/mace/benchmark/model_throughput_test.cc b/mace/benchmark/model_throughput_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..48e622cb7b79b026b89ad6eb1fb4349056b2d9e8 --- /dev/null +++ b/mace/benchmark/model_throughput_test.cc @@ -0,0 +1,281 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +/** + * Usage: + * throughput_test \ + * --input_shape=1,224,224,3 \ + * --output_shape=1,224,224,2 \ + * --input_file=input_data \ + * --cpu_model_data_file=cpu_model_data.data \ + * --gpu_model_data_file=gpu_model_data.data \ + * --dsp_model_data_file=dsp_model_data.data \ + * --run_seconds=10 + */ +#include +#include +#include +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +using namespace std; +using namespace mace; + +namespace mace { + +#ifdef MACE_CPU_MODEL_TAG +namespace MACE_CPU_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_CPU_MODEL_TAG +#endif + +#ifdef MACE_GPU_MODEL_TAG +namespace MACE_GPU_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_GPU_MODEL_TAG +#endif + +#ifdef MACE_DSP_MODEL_TAG +namespace 
MACE_DSP_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_DSP_MODEL_TAG +#endif + +} // namespace mace + +void ParseShape(const string &str, vector *shape) { + string tmp = str; + while (!tmp.empty()) { + int dim = atoi(tmp.data()); + shape->push_back(dim); + size_t next_offset = tmp.find(","); + if (next_offset == string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +DeviceType ParseDeviceType(const string &device_str) { + if (device_str.compare("CPU") == 0) { + return DeviceType::CPU; + } else if (device_str.compare("NEON") == 0) { + return DeviceType::NEON; + } else if (device_str.compare("OPENCL") == 0) { + return DeviceType::OPENCL; + } else if (device_str.compare("HEXAGON") == 0) { + return DeviceType::HEXAGON; + } else { + return DeviceType::CPU; + } +} + +DEFINE_string(input_shape, "1,224,224,3", "input shape, separated by comma"); +DEFINE_string(output_shape, "1,224,224,2", "output shape, separated by comma"); +DEFINE_string(input_file, "", "input file name"); +DEFINE_string(cpu_model_data_file, "", "cpu model data file name"); +DEFINE_string(gpu_model_data_file, "", "gpu model data file name"); +DEFINE_string(dsp_model_data_file, "", "dsp model data file name"); +DEFINE_int32(run_seconds, 10, "run seconds"); + +int main(int argc, char **argv) { + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "mace version: " << MaceVersion(); + LOG(INFO) << "mace git version: " << MaceGitVersion(); +#ifdef MACE_CPU_MODEL_TAG + LOG(INFO) << "cpu model checksum: " << mace::MACE_CPU_MODEL_TAG::ModelChecksum(); +#endif +#ifdef MACE_GPU_MODEL_TAG + LOG(INFO) << "gpu model checksum: " << mace::MACE_GPU_MODEL_TAG::ModelChecksum(); +#endif +#ifdef 
MACE_DSP_MODEL_TAG + LOG(INFO) << "dsp model checksum: " << mace::MACE_DSP_MODEL_TAG::ModelChecksum(); +#endif + LOG(INFO) << "input_shape: " << FLAGS_input_shape; + LOG(INFO) << "output_shape: " << FLAGS_output_shape; + LOG(INFO) << "input_file: " << FLAGS_input_file; + LOG(INFO) << "cpu_model_data_file: " << FLAGS_cpu_model_data_file; + LOG(INFO) << "gpu_model_data_file: " << FLAGS_gpu_model_data_file; + LOG(INFO) << "dsp_model_data_file: " << FLAGS_dsp_model_data_file; + LOG(INFO) << "run_seconds: " << FLAGS_run_seconds; + + vector input_shape_vec; + vector output_shape_vec; + ParseShape(FLAGS_input_shape, &input_shape_vec); + ParseShape(FLAGS_output_shape, &output_shape_vec); + + int64_t input_size = + std::accumulate(input_shape_vec.begin(), input_shape_vec.end(), 1, + std::multiplies()); + int64_t output_size = + std::accumulate(output_shape_vec.begin(), output_shape_vec.end(), 1, + std::multiplies()); + std::unique_ptr input_data(new float[input_size]); + std::unique_ptr cpu_output_data(new float[output_size]); + std::unique_ptr gpu_output_data(new float[output_size]); + std::unique_ptr dsp_output_data(new float[output_size]); + + // load input + ifstream in_file(FLAGS_input_file, ios::in | ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_data.get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + + int64_t t0, t1, init_micros; +#ifdef MACE_CPU_MODEL_TAG + /* --------------------- CPU init ----------------------- */ + LOG(INFO) << "Load & init cpu model and warm up"; + const unsigned char *cpu_model_data = + mace::MACE_CPU_MODEL_TAG::LoadModelData(FLAGS_cpu_model_data_file.c_str()); + NetDef cpu_net_def = mace::MACE_CPU_MODEL_TAG::CreateNet(cpu_model_data); + + mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU); + + LOG(INFO) << "CPU Warm up run"; + t0 = NowMicros(); + cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get()); + t1 = 
NowMicros();
+  LOG(INFO) << "CPU 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+#ifdef MACE_GPU_MODEL_TAG
+  /* --------------------- GPU init ----------------------- */
+  LOG(INFO) << "Load & init gpu model and warm up";
+  const unsigned char *gpu_model_data =
+      mace::MACE_GPU_MODEL_TAG::LoadModelData(FLAGS_gpu_model_data_file.c_str());
+  NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
+
+  mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL);
+  mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
+
+  LOG(INFO) << "GPU Warm up run";
+  t0 = NowMicros();
+  gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
+  t1 = NowMicros();
+  LOG(INFO) << "GPU 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+#ifdef MACE_DSP_MODEL_TAG
+  /* --------------------- DSP init ----------------------- */
+  LOG(INFO) << "Load & init dsp model and warm up";
+  // Fix: load the DSP model data file (was mistakenly FLAGS_gpu_model_data_file).
+  const unsigned char *dsp_model_data =
+      mace::MACE_DSP_MODEL_TAG::LoadModelData(FLAGS_dsp_model_data_file.c_str());
+  NetDef dsp_net_def = mace::MACE_DSP_MODEL_TAG::CreateNet(dsp_model_data);
+
+  mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON);
+  mace::MACE_DSP_MODEL_TAG::UnloadModelData(dsp_model_data);
+
+  LOG(INFO) << "DSP Warm up run";
+  t0 = NowMicros();
+  // Fix: warm up the DSP engine (was mistakenly gpu_engine).
+  dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
+  t1 = NowMicros();
+  LOG(INFO) << "DSP 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+  double cpu_throughput = 0;
+  double gpu_throughput = 0;
+  double dsp_throughput = 0;
+  int64_t run_micros = FLAGS_run_seconds * 1000000;
+
+#ifdef MACE_CPU_MODEL_TAG
+  std::thread cpu_thread([&]() {
+    int64_t frames = 0;
+    int64_t micros = 0;
+    int64_t start = NowMicros();
+    for (; micros < run_micros; ++frames) {
+      cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
+      int64_t end = NowMicros();
+      micros = end - start;
+    }
+    cpu_throughput = frames * 1000000.0 / micros;
+  });
+#endif
+
+#ifdef MACE_GPU_MODEL_TAG + std::thread gpu_thread([&]() { + int64_t frames = 0; + int64_t micros = 0; + int64_t start = NowMicros(); + for (; micros < run_micros; ++frames) { + gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get()); + int64_t end = NowMicros(); + micros = end - start; + } + gpu_throughput = frames * 1000000.0 / micros; + }); +#endif + +#ifdef MACE_DSP_MODEL_TAG + std::thread dsp_thread([&]() { + int64_t frames = 0; + int64_t micros = 0; + int64_t start = NowMicros(); + for (; micros < run_micros; ++frames) { + dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get()); + int64_t end = NowMicros(); + micros = end - start; + } + dsp_throughput = frames * 1000000.0 / micros; + }); +#endif + + double total_throughput = 0; + +#ifdef MACE_CPU_MODEL_TAG + cpu_thread.join(); + LOG(INFO) << "CPU throughput: " << cpu_throughput << " f/s"; + total_throughput += cpu_throughput; +#endif +#ifdef MACE_GPU_MODEL_TAG + gpu_thread.join(); + LOG(INFO) << "GPU throughput: " << gpu_throughput << " f/s"; + total_throughput += gpu_throughput; +#endif +#ifdef MACE_DSP_MODEL_TAG + dsp_thread.join(); + LOG(INFO) << "DSP throughput: " << dsp_throughput << " f/s"; + total_throughput += dsp_throughput; +#endif + + LOG(INFO) << "Total throughput: " << total_throughput << " f/s"; +} diff --git a/mace/benchmark/stat_summarizer.cc b/mace/benchmark/stat_summarizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..68801fb26aef120b74a23cf561056f9f57398c6f --- /dev/null +++ b/mace/benchmark/stat_summarizer.cc @@ -0,0 +1,320 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#include "benchmark/stat_summarizer.h" +#include "mace/public/mace.h" +#include "mace/utils/logging.h" + +#include +#include +#include + +namespace mace { + +StatSummarizer::StatSummarizer(const StatSummarizerOptions &options) + : options_(options) {} + +StatSummarizer::~StatSummarizer() {} + +void StatSummarizer::Reset() { + run_total_us_.Reset(); + memory_.Reset(); + details_.clear(); +} + +void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) { + int64_t curr_total_us = 0; + int64_t mem_total = 0; + + if (run_metadata.op_stats.empty()) { + std::cerr << "Runtime op stats should not be empty" << std::endl; + abort(); + } + int64_t first_node_start_us = run_metadata.op_stats[0].stats.start_micros; + + int node_num = 0; + for (const auto &ops : run_metadata.op_stats) { + std::string name = ops.operator_name; + std::string op_type = ops.type; + + ++node_num; + const int64_t curr_time = ops.stats.end_micros - ops.stats.start_micros; + curr_total_us += curr_time; + auto result = details_.emplace(name, Detail()); + Detail *detail = &(result.first->second); + + detail->start_us.UpdateStat(ops.stats.start_micros - first_node_start_us); + detail->rel_end_us.UpdateStat(curr_time); + + // If this is the first pass, initialize some values. 
+ if (result.second) { + detail->name = name; + detail->type = op_type; + + detail->run_order = node_num; + + detail->times_called = 0; + } + + ++detail->times_called; + } + + run_total_us_.UpdateStat(curr_total_us); + memory_.UpdateStat(mem_total); +} + +std::string StatSummarizer::ShortSummary() const { + std::stringstream stream; + stream << "Timings (microseconds): "; + run_total_us_.OutputToStream(&stream); + stream << std::endl; + + stream << "Memory (bytes): "; + memory_.OutputToStream(&stream); + stream << std::endl; + + stream << details_.size() << " nodes observed" << std::endl; + return stream.str(); +} + +std::ostream &InitField(std::ostream &stream, int width) { + stream << "\t" << std::right << std::setw(width) << std::fixed + << std::setprecision(3); + return stream; +} + +std::string StatSummarizer::HeaderString(const std::string &title) const { + std::stringstream stream; + + stream << "============================== " << title + << " ==============================" << std::endl; + + InitField(stream, 14) << "[node type]"; + InitField(stream, 9) << "[start]"; + InitField(stream, 9) << "[first]"; + InitField(stream, 9) << "[avg ms]"; + InitField(stream, 8) << "[%]"; + InitField(stream, 8) << "[cdf%]"; + InitField(stream, 10) << "[mem KB]"; + InitField(stream, 9) << "[times called]"; + stream << "\t" + << "[Name]"; + return stream.str(); +} + +std::string StatSummarizer::ColumnString(const StatSummarizer::Detail &detail, + const int64_t cumulative_stat_on_node, + const Stat &stat) const { + const double start_ms = detail.start_us.avg() / 1000.0; + const double first_time_ms = detail.rel_end_us.first() / 1000.0; + const double avg_time_ms = detail.rel_end_us.avg() / 1000.0; + const double percentage = detail.rel_end_us.sum() * 100.0 / stat.sum(); + const double cdf_percentage = (cumulative_stat_on_node * 100.0f) / stat.sum(); + const int64_t times_called = detail.times_called / num_runs(); + + std::stringstream stream; + InitField(stream, 14) << 
detail.type; + InitField(stream, 9) << start_ms; + InitField(stream, 9) << first_time_ms; + InitField(stream, 9) << avg_time_ms; + InitField(stream, 7) << percentage << "%"; + InitField(stream, 7) << cdf_percentage << "%"; + InitField(stream, 10) << detail.mem_used.newest() / 1000.0; + InitField(stream, 9) << times_called; + stream << "\t" << detail.name; + + return stream.str(); +} + +void StatSummarizer::OrderNodesByMetric( + SortingMetric metric, std::vector *details) const { + std::priority_queue> sorted_list; + const int num_nodes = details_.size(); + + for (const auto &det : details_) { + const Detail *detail = &(det.second); + std::stringstream stream; + stream << std::setw(20) << std::right << std::setprecision(10) + << std::fixed; + + switch (metric) { + case BY_NAME: + stream << detail->name; + break; + case BY_RUN_ORDER: + stream << num_nodes - detail->run_order; + break; + case BY_TIME: + stream << detail->rel_end_us.avg(); + break; + case BY_MEMORY: + stream << detail->mem_used.avg(); + break; + case BY_TYPE: + stream << detail->type; + break; + default: + stream << ""; + break; + } + + sorted_list.emplace(stream.str(), detail); + } + + while (!sorted_list.empty()) { + auto entry = sorted_list.top(); + sorted_list.pop(); + details->push_back(entry.second); + } +} + +void StatSummarizer::ComputeStatsByType( + std::map *node_type_map_count, + std::map *node_type_map_time, + std::map *node_type_map_memory, + std::map *node_type_map_times_called, + int64_t *accumulated_us) const { + int64_t run_count = run_total_us_.count(); + + for (const auto &det : details_) { + const std::string node_name = det.first; + const Detail &detail = det.second; + + int64_t curr_time_val = + static_cast(detail.rel_end_us.sum() / run_count); + *accumulated_us += curr_time_val; + + int64_t curr_memory_val = detail.mem_used.newest(); + + const std::string &node_type = detail.type; + + (*node_type_map_count)[node_type] += 1; + (*node_type_map_time)[node_type] += curr_time_val; + 
(*node_type_map_memory)[node_type] += curr_memory_val; + (*node_type_map_times_called)[node_type] += detail.times_called / run_count; + } +} + +std::string StatSummarizer::GetStatsByNodeType() const { + std::stringstream stream; + + stream << "============================== Summary by node type " + "==============================" + << std::endl; + + LOG(INFO) << "Number of nodes executed: " << details_.size() << std::endl; + + std::map node_type_map_count; + std::map node_type_map_time; + std::map node_type_map_memory; + std::map node_type_map_times_called; + int64_t accumulated_us = 0; + + ComputeStatsByType(&node_type_map_count, &node_type_map_time, + &node_type_map_memory, &node_type_map_times_called, + &accumulated_us); + + // Sort them. + std::priority_queue>> + timings; + for (const auto &node_type : node_type_map_time) { + const int64_t mem_used = node_type_map_memory[node_type.first]; + timings.emplace(node_type.second, + std::pair(node_type.first, mem_used)); + } + + InitField(stream, 14) << "[Node type]"; + InitField(stream, 9) << "[count]"; + InitField(stream, 10) << "[avg ms]"; + InitField(stream, 11) << "[avg %]"; + InitField(stream, 11) << "[cdf %]"; + InitField(stream, 10) << "[mem KB]"; + InitField(stream, 10) << "[times called]"; + stream << std::endl; + + float cdf = 0.0f; + while (!timings.empty()) { + auto entry = timings.top(); + timings.pop(); + + const std::string node_type = entry.second.first; + const float memory = entry.second.second / 1000.0f; + + const int64_t node_type_total_us = entry.first; + const float time_per_run_ms = node_type_total_us / 1000.0f; + + const float percentage = + ((entry.first / static_cast(accumulated_us)) * 100.0f); + cdf += percentage; + + InitField(stream, 14) << node_type; + InitField(stream, 9) << node_type_map_count[node_type]; + InitField(stream, 10) << time_per_run_ms; + InitField(stream, 10) << percentage << "%"; + InitField(stream, 10) << cdf << "%"; + InitField(stream, 10) << memory; + 
InitField(stream, 9) << node_type_map_times_called[node_type]; + stream << std::endl; + } + stream << std::endl; + return stream.str(); +} + +std::string StatSummarizer::GetStatsByMetric(const std::string &title, + SortingMetric sorting_metric, + int num_stats) const { + std::vector details; + OrderNodesByMetric(sorting_metric, &details); + + double cumulative_stat_on_node = 0; + + std::stringstream stream; + stream << HeaderString(title) << std::endl; + int stat_num = 0; + for (auto detail : details) { + ++stat_num; + if (num_stats > 0 && stat_num > num_stats) { + break; + } + + cumulative_stat_on_node += detail->rel_end_us.sum(); + stream << ColumnString(*detail, cumulative_stat_on_node, run_total_us_) + << std::endl; + } + stream << std::endl; + return stream.str(); +} + +std::string StatSummarizer::GetOutputString() const { + std::stringstream stream; + if (options_.show_run_order) { + stream << GetStatsByMetric("Run Order", BY_RUN_ORDER, + options_.run_order_limit); + } + if (options_.show_time) { + stream << GetStatsByMetric("Top by Computation Time", BY_TIME, + options_.time_limit); + } + if (options_.show_memory) { + stream << GetStatsByMetric("Top by Memory Use", BY_MEMORY, + options_.memory_limit); + } + if (options_.show_type) { + stream << GetStatsByNodeType(); + } + if (options_.show_summary) { + stream << ShortSummary() << std::endl; + } + return stream.str(); +} + +void StatSummarizer::PrintOperatorStats() const { + std::string output = GetOutputString(); + std::istringstream iss(output); + for (std::string line; std::getline(iss, line);) { + LOG(INFO) << line; + } +} + +} // namespace mace diff --git a/mace/benchmark/stat_summarizer.h b/mace/benchmark/stat_summarizer.h new file mode 100644 index 0000000000000000000000000000000000000000..368b1e30299b77467b847396eeb20df586895618 --- /dev/null +++ b/mace/benchmark/stat_summarizer.h @@ -0,0 +1,199 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#ifndef MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_ +#define MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_ + +#include +#include +#include +#include +#include +#include +#include + +namespace mace { + +class RunMetadata; + +template +class Stat { + public: + void UpdateStat(ValueType v) { + if (count_ == 0) { + first_ = v; + } + + newest_ = v; + max_ = std::max(v, max_); + min_ = std::min(v, min_); + ++count_; + sum_ += v; + squared_sum_ += static_cast(v) * v; + } + + void Reset() { new (this) Stat(); } + + bool empty() const { return count_ == 0; } + + ValueType first() const { return first_; } + + ValueType newest() const { return newest_; } + + ValueType max() const { return max_; } + + ValueType min() const { return min_; } + + int64_t count() const { return count_; } + + ValueType sum() const { return sum_; } + + HighPrecisionValueType squared_sum() const { return squared_sum_; } + + bool all_same() const { return (count_ == 0 || min_ == max_); } + + HighPrecisionValueType avg() const { + return empty() ? std::numeric_limits::quiet_NaN() + : static_cast(sum_) / count_; + } + + ValueType std_deviation() const { + return all_same() ? 
0 : std::sqrt(squared_sum_ / count_ - avg() * avg()); + } + + void OutputToStream(std::ostream *stream) const { + if (empty()) { + *stream << "count=0"; + } else if (all_same()) { + *stream << "count=" << count_ << " curr=" << newest_; + if (count_ > 1) *stream << "(all same)"; + } else { + *stream << "count=" << count_ << " first=" << first_ + << " curr=" << newest_ << " min=" << min_ << " max=" << max_ + << " avg=" << avg() << " std=" << std_deviation(); + } + } + + friend std::ostream &operator<<(std::ostream &stream, + const Stat &stat) { + stat.OutputToStream(&stream); + return stream; + } + + private: + ValueType first_ = 0; + ValueType newest_ = 0; + ValueType max_ = std::numeric_limits::min(); + ValueType min_ = std::numeric_limits::max(); + int64_t count_ = 0; + ValueType sum_ = 0; + HighPrecisionValueType squared_sum_ = 0; +}; + +// Used to control the output of the statistics summarizer; +class StatSummarizerOptions { + public: + StatSummarizerOptions() + : show_run_order(true), + run_order_limit(0), + show_time(true), + time_limit(10), + show_memory(true), + memory_limit(10), + show_type(true), + show_summary(true) {} + + bool show_run_order; + int run_order_limit; + bool show_time; + int time_limit; + bool show_memory; + int memory_limit; + bool show_type; + bool show_summary; +}; + +// A StatSummarizer assists in performance analysis of Graph executions. +// +// It summarizes time spent executing (on GPU/CPU), memory used etc. across +// multiple executions of a single Graph from the StepStats collected during +// graph execution. +// +// See tensorflow/tools/benchmark/benchmark_model.cc for an example usage. +class StatSummarizer { + public: + enum SortingMetric { + BY_NAME, + BY_RUN_ORDER, + BY_TIME, + BY_MEMORY, + BY_TYPE, + }; + + explicit StatSummarizer(const StatSummarizerOptions &options); + + ~StatSummarizer(); + + // Adds another run's StepStats output to the aggregate counts. 
+ void ProcessMetadata(const RunMetadata &run_metadata); + + // Returns a string detailing the accumulated runtime stats in a tab-separated + // format which can be pasted into a spreadsheet for further analysis. + std::string GetOutputString() const; + + std::string ShortSummary() const; + + // Prints the string returned by GetOutputString(). + void PrintOperatorStats() const; + + void ComputeStatsByType( + std::map *node_type_map_count, + std::map *node_type_map_time, + std::map *node_type_map_memory, + std::map *node_type_map_times_called, + int64_t *accumulated_us) const; + + std::string GetStatsByNodeType() const; + + std::string GetStatsByMetric(const std::string &title, + SortingMetric sorting_metric, + int num_stats) const; + + void Reset(); + + // Returns number of runs. + int num_runs() const { return run_total_us_.count(); } + + // Returns stats of total microseconds spent by all nodes in each run. + const Stat &run_total_us() const { return run_total_us_; } + + private: + struct Detail { + std::string name; + std::string type; + int64_t run_order; + Stat start_us; + Stat rel_end_us; + Stat mem_used; + int64_t times_called; + }; + + void OrderNodesByMetric(SortingMetric sorting_metric, + std::vector *details) const; + + std::string HeaderString(const std::string &title) const; + std::string ColumnString(const Detail &detail, + const int64_t cumulative_stat_on_node, + const Stat &stat) const; + + Stat run_total_us_; + Stat memory_; + + std::map details_; + StatSummarizerOptions options_; +}; + +} // namespace mace + +#endif // MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_ diff --git a/mace/core/BUILD b/mace/core/BUILD index ad32a030ce300be1d37e85e74ec6b60f648b0851..962f8423a88364d807b3f404044632db70300572 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD @@ -104,6 +104,7 @@ cc_library( deps = [ ":opencl_headers", "//mace/codegen:generated_opencl_prod", + "//mace/codegen:generated_tuning_params", "//mace/utils", ], ) diff --git a/mace/examples/BUILD 
b/mace/examples/BUILD index ff47e1d98fcca18a96cee0c9a69bd5a7101554db..af806dcc06688984b7df812f1fe9dc9e77ed93d3 100644 --- a/mace/examples/BUILD +++ b/mace/examples/BUILD @@ -24,3 +24,14 @@ cc_test( "//mace/core:test_benchmark_main", ], ) + +cc_binary( + name = "mace_run", + srcs = ["mace_run.cc"], + linkopts = if_openmp_enabled(["-fopenmp"]), + linkstatic = 1, + deps = [ + "//mace/codegen:generated_models", + "//external:gflags_nothreads", + ], +) diff --git a/mace/examples/mace_run.cc b/mace/examples/mace_run.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c188c274b8fddb3fddb9c6007d2245a3cdebd06 --- /dev/null +++ b/mace/examples/mace_run.cc @@ -0,0 +1,407 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +/** + * Usage: + * mace_run --model=mobi_mace.pb \ + * --input=input_node \ + * --output=output_node \ + * --input_shape=1,224,224,3 \ + * --output_shape=1,224,224,2 \ + * --input_file=input_data \ + * --output_file=mace.out \ + * --model_data_file=model_data.data \ + * --device=OPENCL + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +using namespace std; +using namespace mace; + +namespace mace { +namespace MACE_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_MODEL_TAG +} // namespace mace + + +namespace str_util { + +std::vector Split(const std::string &str, char delims) { + std::vector result; + std::string tmp = str; + while (!tmp.empty()) { + size_t next_offset = tmp.find(delims); + result.push_back(tmp.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } 
+ } + return result; +} + +} // namespace str_util + +void ParseShape(const string &str, vector *shape) { + string tmp = str; + while (!tmp.empty()) { + int dim = atoi(tmp.data()); + shape->push_back(dim); + size_t next_offset = tmp.find(","); + if (next_offset == string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +std::string FormatName(const std::string input) { + std::string res = input; + for (size_t i = 0; i < input.size(); ++i) { + if (!isalnum(res[i])) res[i] = '_'; + } + return res; +} + +DeviceType ParseDeviceType(const string &device_str) { + if (device_str.compare("CPU") == 0) { + return DeviceType::CPU; + } else if (device_str.compare("NEON") == 0) { + return DeviceType::NEON; + } else if (device_str.compare("OPENCL") == 0) { + return DeviceType::OPENCL; + } else if (device_str.compare("HEXAGON") == 0) { + return DeviceType::HEXAGON; + } else { + return DeviceType::CPU; + } +} + +struct mallinfo LogMallinfoChange(struct mallinfo prev) { + struct mallinfo curr = mallinfo(); + if (prev.arena != curr.arena) { + LOG(INFO) << "Non-mmapped space allocated (bytes): " << curr.arena + << ", diff: " << ((int64_t)curr.arena - (int64_t)prev.arena); + } + if (prev.ordblks != curr.ordblks) { + LOG(INFO) << "Number of free chunks: " << curr.ordblks + << ", diff: " << ((int64_t)curr.ordblks - (int64_t)prev.ordblks); + } + if (prev.smblks != curr.smblks) { + LOG(INFO) << "Number of free fastbin blocks: " << curr.smblks + << ", diff: " << ((int64_t)curr.smblks - (int64_t)prev.smblks); + } + if (prev.hblks != curr.hblks) { + LOG(INFO) << "Number of mmapped regions: " << curr.hblks + << ", diff: " << ((int64_t)curr.hblks - (int64_t)prev.hblks); + } + if (prev.hblkhd != curr.hblkhd) { + LOG(INFO) << "Space allocated in mmapped regions (bytes): " << curr.hblkhd + << ", diff: " << ((int64_t)curr.hblkhd - (int64_t)prev.hblkhd); + } + if (prev.usmblks != curr.usmblks) { + LOG(INFO) << "Maximum total allocated space (bytes): " << curr.usmblks + 
<< ", diff: " << ((int64_t)curr.usmblks - (int64_t)prev.usmblks); + } + if (prev.fsmblks != curr.fsmblks) { + LOG(INFO) << "Space in freed fastbin blocks (bytes): " << curr.fsmblks + << ", diff: " << ((int64_t)curr.fsmblks - (int64_t)prev.fsmblks); + } + if (prev.uordblks != curr.uordblks) { + LOG(INFO) << "Total allocated space (bytes): " << curr.uordblks + << ", diff: " + << ((int64_t)curr.uordblks - (int64_t)prev.uordblks); + } + if (prev.fordblks != curr.fordblks) { + LOG(INFO) << "Total free space (bytes): " << curr.fordblks << ", diff: " + << ((int64_t)curr.fordblks - (int64_t)prev.fordblks); + } + if (prev.keepcost != curr.keepcost) { + LOG(INFO) << "Top-most, releasable space (bytes): " << curr.keepcost + << ", diff: " + << ((int64_t)curr.keepcost - (int64_t)prev.keepcost); + } + return curr; +} + +DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma"); +DEFINE_string(input_shape, "1,224,224,3:1,1,1,10", "input shapes, separated by colon and comma"); +DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma"); +DEFINE_string(output_shape, "1,224,224,2:1,1,1,10", "output shapes, separated by colon and comma"); +DEFINE_string(input_file, "", "input file name | input file prefix for multiple inputs."); +DEFINE_string(output_file, "", "output file name | output file prefix for multiple outputs"); +DEFINE_string(model_data_file, "", + "model data file name, used when EMBED_MODEL_DATA set to 0"); +DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); +DEFINE_int32(round, 1, "round"); +DEFINE_int32(restart_round, 1, "restart round"); +DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); + +bool SingleInputAndOutput(const std::vector &input_shape, + const std::vector &output_shape) { + // load model + int64_t t0 = NowMicros(); + const unsigned char *model_data = + mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); + NetDef net_def = 
mace::MACE_MODEL_TAG::CreateNet(model_data); + int64_t t1 = NowMicros(); + LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us"; + int64_t init_micros = t1 - t0; + + DeviceType device_type = ParseDeviceType(FLAGS_device); + LOG(INFO) << "Runing with device type: " << device_type; + + // Init model + LOG(INFO) << "Run init"; + t0 = NowMicros(); + mace::MaceEngine engine(&net_def, device_type); + if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { + mace::MACE_MODEL_TAG::UnloadModelData(model_data); + } + t1 = NowMicros(); + init_micros += t1 - t0; + LOG(INFO) << "Net init latency: " << t1 - t0 << " us"; + LOG(INFO) << "Total init latency: " << init_micros << " us"; + + // Allocate input and output + int64_t input_size = + std::accumulate(input_shape.begin(), input_shape.end(), 1, + std::multiplies()); + int64_t output_size = + std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + std::unique_ptr input_data(new float[input_size]); + std::unique_ptr output_data(new float[output_size]); + + // load input + ifstream in_file(FLAGS_input_file + "_" + FormatName(FLAGS_input_node), ios::in | ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_data.get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + + LOG(INFO) << "Warm up run"; + t0 = NowMicros(); + engine.Run(input_data.get(), input_shape, output_data.get()); + t1 = NowMicros(); + LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us"; + + if (FLAGS_round > 0) { + LOG(INFO) << "Run model"; + t0 = NowMicros(); + struct mallinfo prev = mallinfo(); + for (int i = 0; i < FLAGS_round; ++i) { + engine.Run(input_data.get(), input_shape, output_data.get()); + if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) { + LOG(INFO) << "=== check malloc info change #" << i << " ==="; + prev = LogMallinfoChange(prev); + } + } + t1 = NowMicros(); + 
LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us"; + } + + if (FLAGS_restart_round == 1) { + if (output_data != nullptr) { + std::string + output_name = FLAGS_output_file + "_" + FormatName(FLAGS_output_node); + ofstream out_file(output_name, ios::binary); + out_file.write((const char *) (output_data.get()), + output_size * sizeof(float)); + out_file.flush(); + out_file.close(); + LOG(INFO) << "Write output file " + << output_name + << " with size " << output_size + << " done."; + } else { + LOG(INFO) << "Output data is null"; + } + } + + return true; +} + +bool MultipleInputOrOutput(const std::vector &input_names, + const std::vector> &input_shapes, + const std::vector &output_names, + const std::vector> &output_shapes) { + // load model + int64_t t0 = NowMicros(); + const unsigned char *model_data = + mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); + NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); + int64_t t1 = NowMicros(); + LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us"; + int64_t init_micros = t1 - t0; + + DeviceType device_type = ParseDeviceType(FLAGS_device); + LOG(INFO) << "Runing with device type: " << device_type; + + // Init model + LOG(INFO) << "Run init"; + t0 = NowMicros(); + mace::MaceEngine engine(&net_def, device_type, input_names, output_names); + if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { + mace::MACE_MODEL_TAG::UnloadModelData(model_data); + } + t1 = NowMicros(); + init_micros += t1 - t0; + LOG(INFO) << "Net init latency: " << t1 - t0 << " us"; + LOG(INFO) << "Total init latency: " << init_micros << " us"; + + const size_t input_count = input_names.size(); + const size_t output_count = output_names.size(); + std::vector input_infos(input_count); + std::map outputs; + std::vector> input_datas(input_count); + for (size_t i = 0; i < input_count; ++i) { + // Allocate input and output + int64_t input_size = + std::accumulate(input_shapes[i].begin(), 
input_shapes[i].end(), 1, + std::multiplies()); + input_datas[i].reset(new float[input_size]); + // load input + ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), ios::in | ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_datas[i].get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + input_infos[i].name = input_names[i]; + input_infos[i].shape = input_shapes[i]; + input_infos[i].data = input_datas[i].get(); + } + std::vector> output_datas(output_count); + for (size_t i = 0; i < output_count; ++i) { + int64_t output_size = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::multiplies()); + output_datas[i].reset(new float[output_size]); + outputs[output_names[i]] = output_datas[i].get(); + } + + LOG(INFO) << "Warm up run"; + t0 = NowMicros(); + engine.Run(input_infos, outputs); + t1 = NowMicros(); + LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us"; + + if (FLAGS_round > 0) { + LOG(INFO) << "Run model"; + t0 = NowMicros(); + struct mallinfo prev = mallinfo(); + for (int i = 0; i < FLAGS_round; ++i) { + engine.Run(input_infos, outputs); + if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) { + LOG(INFO) << "=== check malloc info change #" << i << " ==="; + prev = LogMallinfoChange(prev); + } + } + t1 = NowMicros(); + LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us"; + } + + for (size_t i = 0; i < output_count; ++i) { + std::string output_name = FLAGS_output_file + "_" + FormatName(output_names[i]); + ofstream out_file(output_name, ios::binary); + int64_t output_size = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::multiplies()); + out_file.write((const char *) outputs[output_names[i]], + output_size * sizeof(float)); + out_file.flush(); + out_file.close(); + LOG(INFO) << "Write output file " + << output_name + << " with size " << 
output_size + << " done."; + } + + return true; +} + +int main(int argc, char **argv) { + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "mace version: " << MaceVersion(); + LOG(INFO) << "mace git version: " << MaceGitVersion(); + LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum(); + LOG(INFO) << "input node: " << FLAGS_input_node; + LOG(INFO) << "input shape: " << FLAGS_input_shape; + LOG(INFO) << "output node: " << FLAGS_output_node; + LOG(INFO) << "output shape: " << FLAGS_output_shape; + LOG(INFO) << "input_file: " << FLAGS_input_file; + LOG(INFO) << "output_file: " << FLAGS_output_file; + LOG(INFO) << "model_data_file: " << FLAGS_model_data_file; + LOG(INFO) << "device: " << FLAGS_device; + LOG(INFO) << "round: " << FLAGS_restart_round; + LOG(INFO) << "restart_round: " << FLAGS_round; + + std::vector input_names = str_util::Split(FLAGS_input_node, ','); + std::vector output_names = str_util::Split(FLAGS_output_node, ','); + std::vector input_shapes = str_util::Split(FLAGS_input_shape, ':'); + std::vector output_shapes = str_util::Split(FLAGS_output_shape, ':'); + + const size_t input_count = input_shapes.size(); + const size_t output_count = output_shapes.size(); + std::vector> input_shape_vec(input_count); + std::vector> output_shape_vec(output_count); + for (size_t i = 0; i < input_count; ++i) { + ParseShape(input_shapes[i], &input_shape_vec[i]); + } + for (size_t i = 0; i < output_count; ++i) { + ParseShape(output_shapes[i], &output_shape_vec[i]); + } + + bool ret; +#pragma omp parallel for + for (int i = 0; i < FLAGS_restart_round; ++i) { + VLOG(0) << "restart round " << i; + if (input_count == 1 && output_count == 1) { + ret = SingleInputAndOutput(input_shape_vec[0], output_shape_vec[0]); + } else { + ret = MultipleInputOrOutput(input_names, + input_shape_vec, + output_names, + output_shape_vec); + } + } + if(ret) { + return 0; + } else { + return -1; + } +} diff 
--git a/mace/proto/BUILD b/mace/proto/BUILD index 8649197b94508615dd395a991bccfe5205042804..5222b06bda6e1681b15ac7f60317376c5d34fa3d 100644 --- a/mace/proto/BUILD +++ b/mace/proto/BUILD @@ -10,6 +10,15 @@ licenses(["notice"]) # Apache 2.0 load("@com_google_protobuf//:protobuf.bzl", "py_proto_library") +py_proto_library( + name = "mace_py", + srcs = ["mace.proto"], + default_runtime = "@com_google_protobuf//:protobuf_python", + protoc = "@com_google_protobuf//:protoc", + srcs_version = "PY2AND3", + deps = ["@com_google_protobuf//:protobuf_python"], +) + py_proto_library( name = "caffe_py", srcs = ["caffe.proto"], diff --git a/mace/proto/caffe.proto b/mace/proto/caffe.proto index f1f99e5eba428ab9d7159e49b7ff6256323ea719..22764abc33fda32026bf436b685d79aa18ade460 100644 --- a/mace/proto/caffe.proto +++ b/mace/proto/caffe.proto @@ -98,7 +98,7 @@ message NetParameter { // NOTE // Update the next available ID when you add a new SolverParameter field. // -// SolverParameter next available ID: 41 (last added: type) +// SolverParameter next available ID: 43 (last added: weights) message SolverParameter { ////////////////////////////////////////////////////////////////////////////// // Specifying the train and test networks @@ -128,8 +128,7 @@ message SolverParameter { // The states for the train/test nets. Must be unspecified or // specified once per net. // - // By default, all states will have solver = true; - // train_state will have phase = TRAIN, + // By default, train_state will have phase = TRAIN, // and all test_state's will have phase = TEST. // Other defaults are set according to the NetState defaults. optional NetState train_state = 26; @@ -187,7 +186,11 @@ message SolverParameter { optional float clip_gradients = 35 [default = -1]; optional int32 snapshot = 14 [default = 0]; // The snapshot interval - optional string snapshot_prefix = 15; // The prefix for the snapshot. + // The prefix for the snapshot. 
+ // If not set then is replaced by prototxt file path without extention. + // If is set to directory then is augmented by prototxt file name + // without extention. + optional string snapshot_prefix = 15; // whether to snapshot diff in the results or not. Snapshotting diff will help // debugging but the final protocol buffer size will be much larger. optional bool snapshot_diff = 16 [default = false]; @@ -219,7 +222,7 @@ message SolverParameter { // RMSProp decay value // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) - optional float rms_decay = 38; + optional float rms_decay = 38 [default = 0.99]; // If true, print information about the state of the net that may help with // debugging learning problems. @@ -239,6 +242,19 @@ message SolverParameter { } // DEPRECATED: use type instead of solver_type optional SolverType solver_type = 30 [default = SGD]; + + // Overlap compute and communication for data parallel training + optional bool layer_wise_reduce = 41 [default = true]; + + // Path to caffemodel file(s) with pretrained weights to initialize finetuning. + // Tha same as command line --weights parameter for caffe train command. + // If command line --weights parameter if specified, it has higher priority + // and owerwrites this one(s). + // If --snapshot command line parameter is specified, this one(s) are ignored. + // If several model files are expected, they can be listed in a one + // weights parameter separated by ',' (like in a command string) or + // in repeated weights parameters separately. 
+ repeated string weights = 42; } // A message that stores the solver snapshots @@ -389,16 +405,12 @@ message LayerParameter { optional PoolingParameter pooling_param = 121; optional PowerParameter power_param = 122; optional PReLUParameter prelu_param = 131; - optional PSROIPoolingParameter psroi_pooling_param = 149; - optional PSROIAlignParameter psroi_align_param = 1490; optional PythonParameter python_param = 130; optional RecurrentParameter recurrent_param = 146; optional ReductionParameter reduction_param = 136; optional ReLUParameter relu_param = 123; optional ReshapeParameter reshape_param = 133; - optional ROIPoolingParameter roi_pooling_param = 8266711; optional ScaleParameter scale_param = 142; - optional ProposalParameter proposal_param = 8266713; optional SigmoidParameter sigmoid_param = 124; optional SoftmaxParameter softmax_param = 125; optional SPPParameter spp_param = 132; @@ -407,8 +419,6 @@ message LayerParameter { optional ThresholdParameter threshold_param = 128; optional TileParameter tile_param = 138; optional WindowDataParameter window_data_param = 129; - - optional NNPACKConvolutionParameter nnpack_convolution_param = 204; } // Message that stores parameters used to apply transformation @@ -424,7 +434,7 @@ message TransformationParameter { optional uint32 crop_size = 3 [default = 0]; // mean_file and mean_value cannot be specified at the same time optional string mean_file = 4; - // if specified can be repeated once (would substract it from all the channels) + // if specified can be repeated once (would subtract it from all the channels) // or can be repeated the same number of times as channels // (would subtract them from the corresponding channel) repeated float mean_value = 5; @@ -440,7 +450,7 @@ message LossParameter { optional int32 ignore_label = 1; // How to normalize the loss for loss layers that aggregate across batches, // spatial dimensions, or other dimensions. Currently only implemented in - // SoftmaxWithLoss layer. 
+ // SoftmaxWithLoss and SigmoidCrossEntropyLoss layers. enum NormalizationMode { // Divide by the number of examples in the batch times spatial dimensions. // Outputs that receive the ignore label will NOT be ignored in computing @@ -454,6 +464,8 @@ message LossParameter { // Do not normalize the loss. NONE = 3; } + // For historical reasons, the default normalization for + // SigmoidCrossEntropyLoss is BATCH_SIZE and *not* VALID. optional NormalizationMode normalization = 3 [default = VALID]; // Deprecated. Ignored if normalization is specified. If normalization // is not specified, then setting this to false will be equivalent to @@ -504,11 +516,21 @@ message ConcatParameter { } message BatchNormParameter { - // If false, accumulate global mean/variance values via a moving average. If - // true, use those accumulated values instead of computing mean/variance - // across the batch. + // If false, normalization is performed over the current mini-batch + // and global statistics are accumulated (but not yet used) by a moving + // average. + // If true, those accumulated mean and variance values are used for the + // normalization. + // By default, it is set to false when the network is in the training + // phase and true when the network is in the testing phase. optional bool use_global_stats = 1; - // How much does the moving average decay each iteration? + // What fraction of the moving average remains each iteration? + // Smaller values make the moving average decay faster, giving more + // weight to the recent values. + // Each iteration updates the moving average @f$S_{t-1}@f$ with the + // current mean @f$ Y_t @f$ by + // @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$ + // is the moving_average_fraction parameter. optional float moving_average_fraction = 2 [default = .999]; // Small value to add to the variance estimate so that we don't divide by // zero. 
@@ -590,7 +612,6 @@ message ConvolutionParameter { DEFAULT = 0; CAFFE = 1; CUDNN = 2; - NNPACK = 3; } optional Engine engine = 15 [default = DEFAULT]; @@ -660,8 +681,8 @@ message DataParameter { optional bool mirror = 6 [default = false]; // Force the encoded image to have 3 color channels optional bool force_encoded_color = 9 [default = false]; - // Prefetch queue (Number of batches to prefetch to host memory, increase if - // data access bandwidth varies). + // Prefetch queue (Increase if data feeding bandwidth varies, within the + // limit of device memory for GPU training) optional uint32 prefetch = 10 [default = 4]; } @@ -808,6 +829,7 @@ message ImageDataParameter { message InfogainLossParameter { // Specify the infogain matrix source. optional string source = 1; + optional int32 axis = 2 [default = 1]; // axis of prob } message InnerProductParameter { @@ -825,13 +847,6 @@ message InnerProductParameter { // of the weight matrix. The weight matrix itself is not going to be transposed // but rather the transfer flag of operations will be toggled accordingly. 
optional bool transpose = 6 [default = false]; - - enum Engine { - DEFAULT = 0; - CAFFE = 1; - NNPACK = 2; - } - optional Engine engine = 7 [default = DEFAULT]; } message InputParameter { @@ -915,7 +930,6 @@ message PoolingParameter { DEFAULT = 0; CAFFE = 1; CUDNN = 2; - NNPACK = 3; } optional Engine engine = 11 [default = DEFAULT]; // If global_pooling then it will pool over the size of the bottom by doing @@ -930,17 +944,6 @@ message PowerParameter { optional float shift = 3 [default = 0.0]; } -message PSROIPoolingParameter { - required float spatial_scale = 1; - required int32 output_dim = 2; // output channel number - required int32 group_size = 3; // number of groups to encode position-sensitive score maps -} -message PSROIAlignParameter { - required float spatial_scale = 1; - required int32 output_dim = 2; // output channel number - required int32 group_size = 3; // number of groups to encode position-sensitive score maps -} - message PythonParameter { optional string module = 1; optional string layer = 2; @@ -949,9 +952,7 @@ message PythonParameter { // string, dictionary in Python dict format, JSON, etc. You may parse this // string in `setup` method and use it in `forward` and `backward`. optional string param_str = 3 [default = '']; - // Whether this PythonLayer is shared among worker solvers during data parallelism. - // If true, each worker solver sequentially run forward from this layer. - // This value should be set true if you are using it as a data layer. + // DEPRECATED optional bool share_in_parallel = 4 [default = false]; } @@ -1083,17 +1084,6 @@ message ReshapeParameter { optional int32 num_axes = 3 [default = -1]; } -// Message that stores parameters used by ROIPoolingLayer -message ROIPoolingParameter { - // Pad, kernel size, and stride are all given as a single value for equal - // dimensions in height and width or as Y, X pairs. 
- optional uint32 pooled_h = 1 [default = 0]; // The pooled output height - optional uint32 pooled_w = 2 [default = 0]; // The pooled output width - // Multiplicative spatial scale factor to translate ROI coords from their - // input scale to the scale used when pooling - optional float spatial_scale = 3 [default = 1]; -} - message ScaleParameter { // The first axis of bottom[0] (the first input Blob) along which to apply // bottom[1] (the second input Blob). May be negative to index from the end @@ -1131,13 +1121,6 @@ message ScaleParameter { optional FillerParameter bias_filler = 5; } -// Message that stores parameters used by ProposalLayer -message ProposalParameter { - optional uint32 feat_stride = 1 [default = 16]; - repeated uint32 scales = 2; - repeated float ratios = 3; -} - message SigmoidParameter { enum Engine { DEFAULT = 0; @@ -1438,22 +1421,6 @@ message PReLUParameter { // Initial value of a_i. Default is a_i=0.25 for all i. optional FillerParameter filler = 1; - // Whether or not slope paramters are shared across channels. + // Whether or not slope parameters are shared across channels. optional bool channel_shared = 2 [default = false]; } - -message NNPACKConvolutionParameter { - enum Algorithm { - AUTO = 0; - WINOGRAD = 1; - FFT_16x16 = 2; - FFT_8x8 = 3; - } - optional Algorithm algorithm = 1 [default=AUTO]; - enum KernelTransformStrategy { - RECOMPUTE = 0; - REUSE = 1; - } - optional KernelTransformStrategy kernel_transform_strategy = 2 [default=RECOMPUTE]; -} - diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto new file mode 100644 index 0000000000000000000000000000000000000000..c3744edafe7adda5acd6096aec50e70880bb2d44 --- /dev/null +++ b/mace/proto/mace.proto @@ -0,0 +1,137 @@ +syntax = "proto2"; + +package mace; + +enum NetMode { + INIT = 0; + NORMAL = 1; +} + +enum DeviceType { + CPU = 0; // In default, we will use CPU. 
+ NEON = 1; + OPENCL = 2; +} + +enum DataType { + DT_INVALID = 0; + + // Data types that all computation devices are expected to be + // capable to support. + DT_FLOAT = 1; + DT_DOUBLE = 2; + DT_INT32 = 3; + DT_UINT8 = 4; + DT_INT16 = 5; + DT_INT8 = 6; + DT_STRING = 7; + DT_INT64 = 8; + DT_UINT16 = 9; + DT_BOOL = 10; + DT_HALF = 19; + DT_UINT32 = 22; +} + +message TensorProto { + // The dimensions in the tensor. + repeated int64 dims = 1; + optional DataType data_type = 2 [default = DT_FLOAT]; + // For float + repeated float float_data = 3 [packed = true]; + // For int32, uint8, int8, uint16, int16, bool, and float16 + // Note about float16: in storage we will basically convert float16 byte-wise + // to unsigned short and then store them in the int32_data field. + repeated int32 int32_data = 4 [packed = true]; + // For bytes + optional bytes byte_data = 5; + // For strings + repeated bytes string_data = 6; + // For double + repeated double double_data = 9 [packed = true]; + // For int64 + repeated int64 int64_data = 10 [packed = true]; + // Optionally, a name for the tensor. 
+ optional string name = 7; + + optional uint32 node_id = 100; +} + +message Argument { + optional string name = 1; + optional float f = 2; + optional int64 i = 3; + optional bytes s = 4; + repeated float floats = 5; + repeated int64 ints = 6; + repeated bytes strings = 7; +} + +// for hexagon mace-nnlib +message NodeInput { + optional int32 node_id = 1; + optional int32 output_port = 2; +} + +message OutputShape { + repeated int64 dims = 1; +} + +message OperatorDef { + repeated string input = 1; + repeated string output = 2; + optional string name = 3; + optional string type = 4; + repeated Argument arg = 5; + repeated OutputShape output_shape = 6; + repeated DataType output_type = 7; + + repeated int32 mem_id = 10; + + // for hexagon mace-nnlib + optional uint32 node_id = 100; + optional uint32 op_id = 101; + optional uint32 padding = 102; + repeated NodeInput node_input = 103; + repeated int32 out_max_byte_size = 104; // only support 32-bit len +} + +// for memory optimization +message MemoryBlock { + optional int32 mem_id = 1; + optional uint32 x = 2; + optional uint32 y = 3; +} +message MemoryArena { + repeated MemoryBlock mem_block = 1; +} + +// for hexagon mace-nnlib +message InputInfo { + optional string name = 1; + optional int32 node_id = 2; + repeated int32 dims = 3; + optional int32 max_byte_size = 4; // only support 32-bit len + optional DataType data_type = 5 [default = DT_FLOAT]; +} +message OutputInfo { + optional string name = 1; + optional int32 node_id = 2; + repeated int32 dims = 3; + optional int32 max_byte_size = 4; // only support 32-bit len + optional DataType data_type = 5 [default = DT_FLOAT]; +} + +message NetDef { + optional string name = 1; + repeated OperatorDef op = 2; + optional string version = 3; + repeated Argument arg = 4; + repeated TensorProto tensors = 5; + + // for mem optimization + optional MemoryArena mem_arena = 10; + + // for hexagon mace-nnlib + repeated InputInfo input_info = 100; + repeated OutputInfo output_info = 
101; +} diff --git a/mace/python/tools/BUILD b/mace/python/tools/BUILD index 4b898b54411ea02f54e588b32a81e986610cee66..08e7bca4f72def244e292db3099526b9a06687de 100644 --- a/mace/python/tools/BUILD +++ b/mace/python/tools/BUILD @@ -1,8 +1,58 @@ -py_binary( - name = "caffe_ops_stats", - srcs = ["caffe_ops_stats.py"], +py_library( + name = "tf_converter_lib", + srcs = [ + "convert_util.py", + "graph_util.py", + "tf_converter_lib.py", + "tf_dsp_converter_lib.py", + ], srcs_version = "PY2AND3", deps = [ + ":memory_optimizer", + "//mace/proto:mace_py", + ], +) + +py_library( + name = "caffe_converter_lib", + srcs = [ + "caffe_converter_lib.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":memory_optimizer", "//mace/proto:caffe_py", ], ) + +py_library( + name = "source_converter_lib", + srcs = [ + "source_converter_lib.py", + ], + srcs_version = "PY2AND3", + deps = [ + "//mace/proto:mace_py", + ], +) + +py_binary( + name = "converter", + srcs = ["converter.py"], + srcs_version = "PY2AND3", + deps = [ + ":tf_converter_lib", + ":caffe_converter_lib", + ":source_converter_lib", + "@six_archive//:six", + ], +) + +py_binary( + name = "memory_optimizer", + srcs = ["memory_optimizer.py"], + srcs_version = "PY2AND3", + deps = [ + "//mace/proto:mace_py", + ], +) diff --git a/mace/python/tools/binary_codegen.py b/mace/python/tools/binary_codegen.py new file mode 100644 index 0000000000000000000000000000000000000000..aea06a0a1da060051cdf4b97ac93058e8241f3a5 --- /dev/null +++ b/mace/python/tools/binary_codegen.py @@ -0,0 +1,89 @@ +import argparse +import os +import sys +import struct + +import jinja2 + +import numpy as np + +# python mace/python/tools/binary_codegen.py \ +# --binary_dirs=${BIN_FILE} \ +# --binary_file_name=mace_run.config \ +# --output_path=${CODE_GEN_PATH} --variable_name=kTuningParamsData + +FLAGS = None + + +def generate_cpp_source(): + data_map = {} + for binary_dir in FLAGS.binary_dirs.split(","): + binary_path = os.path.join(binary_dir, 
FLAGS.binary_file_name) + if not os.path.exists(binary_path): + continue + + with open(binary_path, "rb") as f: + binary_array = np.fromfile(f, dtype=np.uint8) + + idx = 0 + size, = struct.unpack("Q", binary_array[idx:idx+8]) + print size + idx += 8 + for _ in xrange(size): + key_size, = struct.unpack("i", binary_array[idx:idx+4]) + idx += 4 + key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size]) + idx += key_size + params_size, = struct.unpack("i", binary_array[idx:idx+4]) + idx += 4 + data_map[key] = [] + count = params_size / 4 + params = struct.unpack(str(count) + "i", binary_array[idx:idx+params_size]) + for i in params: + data_map[key].append(i) + idx += params_size + + env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) + return env.get_template('str2vec_maps.cc.tmpl').render( + maps = data_map, + data_type = 'unsigned int', + variable_name = FLAGS.variable_name + ) + +def main(unused_args): + cpp_binary_source = generate_cpp_source() + if os.path.isfile(FLAGS.output_path): + os.remove(FLAGS.output_path) + w_file = open(FLAGS.output_path, "w") + w_file.write(cpp_binary_source) + w_file.close() + +def parse_args(): + """Parses command line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--binary_dirs", + type=str, + default="cl_bin0/,cl_bin1/", + help="The binaries file path.") + parser.add_argument( + "--binary_file_name", + type=str, + default="mace_run.config", + help="The binary file name.") + parser.add_argument( + "--output_path", + type=str, + default="", + help="The path of generated C++ source file which contains the binary.") + parser.add_argument( + "--variable_name", + type=str, + default="kTuningParamsData", + help="global variable name.") + return parser.parse_known_args() + + +if __name__ == '__main__': + FLAGS, unparsed = parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py new 
file mode 100644 index 0000000000000000000000000000000000000000..fbbe9f9f01f78059910cff7ea1cf6e8f490f604a --- /dev/null +++ b/mace/python/tools/caffe_converter_lib.py @@ -0,0 +1,752 @@ +from mace.proto import mace_pb2 +from mace.proto import caffe_pb2 +from mace.python.tools import memory_optimizer +import google.protobuf.text_format +import numpy as np +import math + +pooling_type_mode = { + 'AvgPool': 1, + 'MaxPool': 2 +} + +buffer_type_map = { + 'CONV2D_FILTER' : 0, + 'IN_OUT_CHANNEL' : 1, + 'ARGUMENT' : 2, + 'IN_OUT_HEIGHT' : 3, + 'IN_OUT_WIDTH' : 4, + 'WINOGRAD_FILTER' : 5, + 'DW_CONV2D_FILTER' : 6, + 'WEIGHT_HEIGHT' : 7, +} + +data_type_map = { + 'DT_HALF' : mace_pb2.DT_HALF, + 'DT_FLOAT': mace_pb2.DT_FLOAT +} + +activation_name_map = { + 'ReLU' : 'RELU', + 'Sigmoid' : 'SIGMOID', + 'TanH' : 'TANH', +} + +MACE_INPUT_NODE_NAME = "mace_input_node" +MACE_OUTPUT_NODE_NAME = "mace_output_node" + +OPENCL_IMAGE_MAX_SIZE = 16384 + +class Operator(object): + def __init__(self, name, type, layer): + self.name = name + self.type = type + self.layer = layer + self.parents = [] + self.children = [] + self.data = [] + self.output_shape_map = {} + + def add_parent(self, parent_op): + self.parents.append(parent_op) + parent_op.children.append(self) + + def get_single_parent(self): + if len(self.parents) != 1: + raise Exception('Operation %s expected single parent, but got %s' + % (self.name, len(self.parents))) + return self.parents[0] + +def BlobToNPArray(blob): + if blob.num != 0: + return (np.asarray(blob.data, dtype=np.float32). 
+ reshape((blob.num, blob.channels, blob.height, blob.width))) + else: + return np.asarray(blob.data, dtype=np.float32).reshape(blob.shape.dim) + + +class Shapes(object): + @staticmethod + def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func): + output_shape = np.zeros_like(input_shape) + output_shape[0] = input_shape[0] + output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0] + - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1 + output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1] + - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1 + output_shape[3] = filter_shape[2] + return output_shape + + @staticmethod + def fully_connected_shape(input_shape, weight_shape): + return [input_shape[0], 1, 1, weight_shape[0]] + + @staticmethod + def concat_shape(input_shapes, axis): + output_shape = None + for input_shape in input_shapes: + if output_shape is None: + output_shape = list(input_shape) + else: + output_shape[axis] += input_shape[axis] + return output_shape + + @staticmethod + def slice_shape(input_shape, num_output): + return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output] + +# outputs' name is [op.name + '_' + #] +class CaffeConverter(object): + def __init__(self, caffe_net, weights, net_def, dt, device, winograd): + self.net_def = net_def + self.caffe_net = caffe_net + self.weights = weights + self.dt = dt + self.device = device + self.winograd = winograd + self.resolved_ops = set() + self.ops = [] + self.inputs_map = {} # caffe op name -> mace inputs' name + + # Add Input operations + top_name_map = {} + inputs = caffe_net.input + for input in inputs: + self.ops.extend([Operator(input, 'Input', None)]) + top_name_map[input] = input + + layers = caffe_net.layer + # remove train layers and dropout + layers = self.remove_unused_layers(layers) + + # Construct graph + # Only support single-output layer + # 
layer with single output often use the same top name. + self.ops.extend([Operator(layer.name, layer.type, layer) for layer in layers]) + + self.ops_map = {op.name : op for op in self.ops} + output_op_map = {} + for layer in layers: + op = self.ops_map[layer.name] + for input_name in layer.bottom: + assert input_name != layer.name + parent_op = output_op_map.get(input_name) + if parent_op is None: + parent_op = self.ops_map[input_name] + op.add_parent(parent_op) + if op.name not in self.inputs_map: + self.inputs_map[op.name] = [] + self.inputs_map[op.name].extend([top_name_map[input_name]]) + for i in range(len(layer.top)): + output_name = layer.top[i] + if len(layer.top) == 1: + top_name_map[output_name] = op.name + else: + top_name_map[output_name] = op.name + '_' + str(i) + if output_name == layer.name: + continue + output_op_map[output_name] = op + + + # Load weights + weights_layers = weights.layer + for layer in weights_layers: + if not layer.blobs: + continue + if layer.name in self.ops_map: + op = self.ops_map[layer.name] + op.data = [BlobToNPArray(blob) for blob in layer.blobs] + + # toposort ops + self.ops = self.toposort_ops() + + def CommonConvert(self, op, mace_type): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + op_def.name = op.name + op_def.type = mace_type + op_def.input.extend([name+':0' for name in self.inputs_map[op.name]]) + return op_def + + def remove_unused_layers(self, layers): + phase_map = {0: 'train', 1: 'test'} + test_layers_names = set() + test_layers = [] + for layer in layers: + phase = 'test' + if len(layer.include): + phase = phase_map[layer.include[0].phase] + if len(layer.exclude): + phase = phase_map[layer.exclude[0].phase] + if phase == 'test' and layer.type != 'Dropout': + test_layers.append(layer) + assert layer.name not in test_layers_names + 
test_layers_names.add(layer.name) + return test_layers + + def toposort_ops(self): + sorted_ops = [] + temp_visited = set() + visited = set() + + def search(op): + if op.name in temp_visited: + raise Exception("The model is not DAG") + if op.name in visited: + return + temp_visited.add(op.name) + for parent_op in op.parents: + search(parent_op) + temp_visited.remove(op.name) + sorted_ops.append(op) + visited.add(op.name) + + for op in self.ops: + search(op) + + return sorted_ops + + def add_buffer_to_image(self, input_name, input_type): + output_name = input_name[:-2] + "_b2i" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'BufferToImage' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'mode' + arg.i = 0 + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_image_to_buffer(self, input_name, input_type): + output_name = input_name[:-2] + "_i2b" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_input_transform(self, names, is_single): + for name in names: + if is_single: + new_input_name = MACE_INPUT_NODE_NAME + ":0" + else: + new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'BufferToImage' + op_def.input.extend([new_input_name]) + op_def.output.extend([name+':0']) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + arg = op_def.arg.add() + arg.name = 'T' 
+ arg.i = self.dt + + def add_output_transform(self, names, is_single): + for name in names: + if is_single: + output_name = MACE_OUTPUT_NODE_NAME + ":0" + else: + output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([name+':0']) + op_def.output.extend([output_name]) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + def add_tensor(self, name, value): + tensor = self.net_def.tensors.add() + tensor.name = name + + shape = list(value.shape) + tensor.dims.extend(shape) + + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(value.flat) + + @staticmethod + def add_output_shape(op_def, output_shape): + mace_output_shape = mace_pb2.OutputShape() + mace_output_shape.dims.extend(output_shape) + op_def.output_shape.extend([mace_output_shape]) + + def add_stride_pad_kernel_arg(self, param, op_def): + try: + if len(param.stride) > 1 or len(param.kernel_size) > 1 or len(param.pad) > 1: + raise Exception('Mace does not support multiple stride/kernel_size/pad') + stride = [param.stride[0], param.stride[0]] if len(param.stride) else [1, 1] + pad = [param.pad[0] * 2, param.pad[0] * 2] if len(param.pad) else [0, 0] + kernel = [param.kernel_size[0], param.kernel_size[0]] if len(param.kernel_size) else [0, 0] + except TypeError: + stride = [param.stride, param.stride] + pad = [param.pad * 2, param.pad * 2] + kernel = [param.kernel_size, param.kernel_size] + + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + if param.HasField("stride_h") or param.HasField("stride_w"): + stride = [param.stride_h, param.stride_w] + strides_arg.ints.extend(stride) + # Pad + padding_arg = op_def.arg.add() + padding_arg.name = 'padding_values' + if param.HasField("pad_h") or param.HasField("pad_w"): + pad = [param.pad_h * 2, param.pad_w * 2] + padding_arg.ints.extend(pad) + # kernel + 
if op_def.type == 'Pooling': + kernel_arg = op_def.arg.add() + kernel_arg.name = 'kernels' + if param.HasField("kernel_h") or param.HasField("kernel_w"): + kernel = [param.kernel_h, param.kernel_w] + kernel_arg.ints.extend(kernel) + return pad, stride, kernel + + def convert_conv2d(self, op): + op_def = self.CommonConvert(op, 'Conv2D') + param = op.layer.convolution_param + + # Add filter + weight_tensor_name = op.name + '_weight:0' + weight_data = op.data[0].transpose((2, 3, 0, 1)) + self.add_tensor(weight_tensor_name, weight_data) + + if self.device == 'gpu': + buffer_type = "CONV2D_FILTER" + output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type) + op_def.input.extend([output_name]) + else: + op_def.input.extend([weight_tensor_name]) + + # Add Bias + if len(op.data) == 2: + bias_tensor_name = op.name + '_bias:0' + bias_data = op.data[1].reshape(-1) + self.add_tensor(bias_tensor_name, bias_data) + if self.device == 'gpu': + output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([bias_tensor_name]) + + paddings, strides, _ = self.add_stride_pad_kernel_arg(param, op_def) + dilations = [1, 1] + if len(param.dilation) > 0: + dilation_arg = op_def.arg.add() + dilation_arg.name = 'dilations' + if len(param.dilation) == 1: + dilations = [param.dilation[0], param.dilation[0]] + elif len(param.dilation) == 2: + dilations = [param.dilation[0], param.dilation[1]] + dilation_arg.ints.extend(dilations) + final_op = op + self.resolved_ops.add(op.name) + + output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]], + weight_data.shape, + paddings, strides, dilations, + math.floor) + op.output_shape_map[op.layer.top[0]] = output_shape + + if len(self.ops_map[final_op.name].children) == 1 \ + and self.ops_map[final_op.name].children[0].type in activation_name_map: + activation_op = self.ops_map[final_op.name].children[0] + op_def.type = 
"FusedConv2D" + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + final_op = activation_op + final_op.output_shape_map[final_op.layer.top[0]] = output_shape + self.resolved_ops.add(activation_op.name) + + op_def.output.extend([final_op.name+':0']) + self.add_output_shape(op_def, output_shape) + self.net_def.op.extend([op_def]) + + def convert_batchnorm(self, op): + if len(op.children) != 1 or op.children[0].type != 'Scale': + raise Exception('Now only support BatchNorm+Scale') + op_def = self.CommonConvert(op, 'FoldedBatchNorm') + scale_op = op.children[0] + + epsilon_value = op.layer.batch_norm_param.eps + if op.data[2][0] != 0: + mean_value = (1. / op.data[2][0]) * op.data[0] + var_value = (1. / op.data[2][0]) * op.data[1] + else: + raise RuntimeError('scalar is zero.') + + gamma_value = scale_op.data[0] + beta_value = np.zeros_like(mean_value) + if len(scale_op.data) == 2: + beta_value = scale_op.data[1] + + scale_value = ( + (1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) * + gamma_value).reshape(-1) + offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1) + input_names = [op.name+'_scale:0', op.name+'_offset:0'] + self.add_tensor(input_names[0], scale_value) + self.add_tensor(input_names[1], offset_value) + + if self.device == 'gpu': + for name in input_names: + output_name = self.add_buffer_to_image(name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([name for name in input_names]) + + self.resolved_ops.add(op.name) + self.resolved_ops.add(scale_op.name) + final_op = scale_op + + output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]] + + if len(self.ops_map[final_op.name].children) == 1 \ + and self.ops_map[final_op.name].children[0].type in activation_name_map: + activation_op = self.ops_map[final_op.name].children[0] + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + 
fused_act_arg.s = activation_name_map[activation_op.type]
+      final_op = activation_op
+      final_op.output_shape_map[final_op.layer.top[0]] = output_shape
+      self.resolved_ops.add(activation_op.name)
+
+    op_def.output.extend([final_op.name + ':0'])
+    self.add_output_shape(op_def, output_shape)
+    self.net_def.op.extend([op_def])
+
+  def convert_inner_product(self, op):
+    param = op.layer.inner_product_param
+    try:
+      if param.axis != 1 or param.transpose:
+        raise ValueError('Do not support non-default axis and transpose '
+                         'case for inner product')
+    except AttributeError:
+      pass
+
+    op_def = self.CommonConvert(op, 'FC')
+    weight_tensor_name = op.name + '_weight:0'
+    if op.data[0].ndim not in [2, 4]:
+      raise ValueError('Unexpected weight ndim.')
+    if op.data[0].ndim == 4 and list(op.data[0].shape[:2]) != [1, 1]:
+      raise ValueError('Only support 4D weight with shape [1, 1, *, *]')
+    input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
+
+    weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
+    assert weight_data.shape[1] == (input_shape[1] * input_shape[2] * input_shape[3])
+    weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
+    weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
+    self.add_tensor(weight_tensor_name, weight_data)
+    if self.device == 'gpu':
+      if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \
+          or weight_data.shape[1] > OPENCL_IMAGE_MAX_SIZE:
+        raise Exception('Mace gpu do not support FC with weight shape: '
+                        +str(weight_data.shape))
+      buffer_type = "WEIGHT_HEIGHT"
+      output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
+      op_def.input.extend([output_name])
+    else:
+      op_def.input.extend([weight_tensor_name])
+
+    # Add Bias
+    if len(op.data) == 2:
+      bias_tensor_name = op.name + '_bias:0'
+      bias_data = op.data[1].reshape(-1)
+      self.add_tensor(bias_tensor_name, bias_data)
+      if self.device == 'gpu':
+        output_name = 
self.add_buffer_to_image(bias_tensor_name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([bias_tensor_name]) + + self.resolved_ops.add(op.name) + output_shape = Shapes.fully_connected_shape(input_shape, weight_data.shape) + op.output_shape_map[op.layer.top[0]] = output_shape + final_op = op + + if len(self.ops_map[final_op.name].children) == 1 \ + and self.ops_map[final_op.name].children[0].type in activation_name_map: + activation_op = self.ops_map[final_op.name].children[0] + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + final_op = activation_op + final_op.output_shape_map[final_op.layer.top[0]] = output_shape + self.resolved_ops.add(activation_op.name) + + op_def.output.extend([final_op.name + ':0']) + self.add_output_shape(op_def, output_shape) + self.net_def.op.extend([op_def]) + + def convert_pooling(self, op): + op_def = self.CommonConvert(op, 'Pooling') + + param = op.layer.pooling_param + paddings, strides, kernels = self.add_stride_pad_kernel_arg(param, op_def) + if param.pool == caffe_pb2.PoolingParameter.MAX: + pooling_type = "MaxPool" + elif param.pool == caffe_pb2.PoolingParameter.AVE: + pooling_type = "AvgPool" + pooling_type_arg = op_def.arg.add() + pooling_type_arg.name = 'pooling_type' + pooling_type_arg.i = pooling_type_mode[pooling_type] + + input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]] + filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] + output_shape = Shapes.conv_pool_shape(input_shape, filter_shape, + paddings, strides, [1, 1], math.ceil) + op.output_shape_map[op.layer.top[0]] = output_shape + + op_def.output.extend([op.name + ':0']) + self.add_output_shape(op_def, output_shape) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def convert_activation(self, op): + op_def = self.CommonConvert(op, 'Activation') + activation_arg = op_def.arg.add() + 
activation_arg.name = 'activation'
+    activation_arg.s = activation_name_map[op.type]
+    op_def.output.extend([op.name + ':0'])
+    output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
+    op.output_shape_map[op.layer.top[0]] = output_shape
+    self.add_output_shape(op_def, output_shape)
+    self.net_def.op.extend([op_def])
+    self.resolved_ops.add(op.name)
+
+  def convert_prelu(self, op):
+    op_def = self.CommonConvert(op, 'Activation')
+    activation_arg = op_def.arg.add()
+    activation_arg.name = 'activation'
+    activation_arg.s = 'PRELU'
+    alpha_tensor_name = op.name + '_alpha:0'
+    alpha_data = op.data[0].reshape(-1)
+    self.add_tensor(alpha_tensor_name, alpha_data)
+    if self.device == 'gpu':
+      output_name = self.add_buffer_to_image(alpha_tensor_name, "ARGUMENT")
+      op_def.input.extend([output_name])
+    else:
+      op_def.input.extend([alpha_tensor_name])
+    op_def.output.extend([op.name + ':0'])
+    output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
+    op.output_shape_map[op.layer.top[0]] = output_shape
+    self.add_output_shape(op_def, output_shape)
+    self.net_def.op.extend([op_def])
+    self.resolved_ops.add(op.name)
+
+  def convert_add(self, op):
+    op_def = self.CommonConvert(op, 'AddN')
+    op_def.output.extend([op.name + ':0'])
+    output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
+    op.output_shape_map[op.layer.top[0]] = output_shape
+    self.add_output_shape(op_def, output_shape)
+    self.net_def.op.extend([op_def])
+    self.resolved_ops.add(op.name)
+
+  def convert_concat(self, op):
+    op_def = self.CommonConvert(op, 'Concat')
+    axis_arg = op_def.arg.add()
+    axis_arg.name = 'axis'
+    axis_arg.i = 3
+    try:
+      if op.layer.concat_param.HasField('axis'):
+        axis_arg.i = op.layer.concat_param.axis
+      elif op.layer.concat_param.HasField('concat_dim'):
+        axis_arg.i = op.layer.concat_param.concat_dim
+    except AttributeError:
+      pass
+
+    input_shapes = []
+    for i in range(len(op.parents)):
+ 
input_shapes.append(op.parents[i].output_shape_map[op.layer.bottom[i]]) + output_shape = Shapes.concat_shape(input_shapes, axis_arg.i) + op.output_shape_map[op.layer.top[0]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def convert_eltwise(self, op): + op_def = self.CommonConvert(op, 'Eltwise') + param = op.layer.eltwise_param + type_arg = op_def.arg.add() + type_arg.name = 'type' + type_arg.i = param.operation + if len(param.coeff) > 0: + coeff_arg = op_def.arg.add() + coeff_arg.name = 'coeff' + coeff_arg.ints.extend(list(param.coeff)) + + output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] + op.output_shape_map[op.layer.top[0]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def convert_slice(self, op): + op_def = self.CommonConvert(op, 'Slice') + if op.layer.HasField('slice_param'): + param = op.layer.slice_param + if param.HasField('axis') and param.axis != 1: + raise Exception('Mace do not support slice with axis ' + str(param.axis)) + if len(param.slice_point) > 0: + raise Exception('Mace do not support slice with slice_point') + + input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] + num_outputs = len(op.layer.top) + if (input_shape[3] % num_outputs) != 0 or \ + (self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) : + raise Exception('Mace do not support slice with input shape ' + + str(input_shape) + ' and number of output ' + str(num_outputs)) + output_shape = Shapes.slice_shape(input_shape, num_outputs) + for i in range(len(op.layer.top)): + op.output_shape_map[op.layer.top[i]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + '_' + str(i) + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + 
def convert_normal_op(self, op): + op_def = self.CommonConvert(op, op.type) + output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] + op.output_shape_map[op.layer.top[0]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def replace_in_out_name(self, input_names, output_names, is_single): + in_names = set([input_name + ":0" for input_name in input_names]) + out_names = set([output_name + ":0" for output_name in output_names]) + if is_single: + for op in self.net_def.op: + if len(op.input) > 0 and op.input[0] in in_names: + op.input[0] = MACE_INPUT_NODE_NAME + ':0' + if len(op.output) > 0 and op.output[0] in out_names: + op.output[0] = MACE_OUTPUT_NODE_NAME + ':0' + else: + for op in self.net_def.op: + if len(op.input) > 0 and op.input[0] in in_names: + op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0] + if len(op.output) > 0 and op.output[0] in out_names: + op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0] + + def add_input_op_shape(self, input_nodes, input_shapes): + assert len(input_nodes) == len(input_shapes) + for i in range(len(input_nodes)): + input_op = self.ops_map[input_nodes[i]] + if input_op.layer is not None: + input_op.output_shape_map[input_op.layer.top[0]] = input_shapes[i] + else: + input_op.output_shape_map[input_op.name] = input_shapes[i] + + def convert(self, input_nodes, input_shapes, output_nodes): + is_single = len(input_nodes) == 1 and len(output_nodes) == 1 + if self.device == 'gpu': + self.add_input_transform(input_nodes, is_single) + + assert self.ops[0].type == 'Input' + self.add_input_op_shape(input_nodes, input_shapes) + + for op in self.ops: + if op.name in self.resolved_ops: + continue + if op.type == 'Input': + self.resolved_ops.add(op.name) + elif op.type == 'Convolution': + self.convert_conv2d(op) + elif op.type == 'BatchNorm': + self.convert_batchnorm(op) + elif op.type == 
'InnerProduct': + self.convert_inner_product(op) + elif op.type == 'Pooling': + self.convert_pooling(op) + elif op.type == 'PReLU': + self.convert_prelu(op) + elif op.type in ['ReLU', 'Sigmoid', 'TanH']: + self.convert_activation(op) + elif op.type == 'Add': + self.convert_add(op) + elif op.type == 'Concat': + self.convert_concat(op) + elif op.type == 'Eltwise': + self.convert_eltwise(op) + elif op.type in ['Softmax']: + self.convert_normal_op(op) + elif op.type == 'Slice': + self.convert_slice(op) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) + + if self.device == 'gpu': + self.add_output_transform(output_nodes, is_single) + + if self.device == 'cpu': + self.replace_in_out_name(input_nodes, output_nodes, is_single) + + for op in self.ops: + if op.name not in self.resolved_ops: + print 'Unresolve Op: %s with type %s' % (op.name, op.type) + + +def convert_to_mace_pb(model_file, weight_file, input_node_str, input_shape_str, output_node_str, data_type, device, winograd): + net_def = mace_pb2.NetDef() + dt = data_type_map[data_type] + + caffe_net = caffe_pb2.NetParameter() + with open(model_file, "r") as f: + google.protobuf.text_format.Merge(str(f.read()), caffe_net) + + weights = caffe_pb2.NetParameter() + with open(weight_file, "rb") as f: + weights.MergeFromString(f.read()) + + input_nodes = [x for x in input_node_str.split(',')] + input_shapes = [] + if input_shape_str != "": + input_shape_strs = [x for x in input_shape_str.split(':')] + for shape_str in input_shape_strs: + input_shapes.extend([[int(x) for x in shape_str.split(',')]]) + output_nodes = [x for x in output_node_str.split(',')] + assert len(input_nodes) == len(input_shapes) + + converter = CaffeConverter(caffe_net, weights, net_def, dt, device, winograd) + converter.convert(input_nodes, input_shapes, output_nodes) + print "PB Converted." + if device == 'gpu': + print "start optimize memory." 
+ mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) + mem_optimizer.optimize() + print "Memory optimization done." + + return net_def diff --git a/mace/python/tools/caffe_ops_stats.py b/mace/python/tools/caffe_ops_stats.py deleted file mode 100644 index 4eba5b664de816722d370c61757117ef0ffd25fe..0000000000000000000000000000000000000000 --- a/mace/python/tools/caffe_ops_stats.py +++ /dev/null @@ -1,42 +0,0 @@ -from mace.proto import caffe_pb2 -import google.protobuf.text_format -import operator -import functools -import argparse -import sys -import six -import os.path - -FLAGS = None - -def main(unused_args): - if not os.path.isfile(FLAGS.input): - print 'input model file not exist' - return -1 - net = caffe_pb2.NetParameter() - with open(FLAGS.input) as f: - google.protobuf.text_format.Merge(str(f.read()), net) - - ops = {} - for layer in net.layer: - if layer.type not in ops: - ops[layer.type] = 1 - else: - ops[layer.type] += 1 - - for key, value in sorted(ops.items(), key=operator.itemgetter(1)): - print key, ":", value - -def parse_args(): - '''Parses command line arguments.''' - parser = argparse.ArgumentParser() - parser.add_argument( - '--input', - type=str, - default='', - help='Caffe \'GraphDef\' file to load.') - return parser.parse_known_args() - -if __name__ == '__main__': - FLAGS, unparsed = parse_args() - main(unused_args=[sys.argv[0]] + unparsed) diff --git a/mace/python/tools/convert_util.py b/mace/python/tools/convert_util.py new file mode 100644 index 0000000000000000000000000000000000000000..53b3196952e991e6163b83ff3ca14395bcba9856 --- /dev/null +++ b/mace/python/tools/convert_util.py @@ -0,0 +1,29 @@ +import tensorflow as tf +from mace.proto import mace_pb2 + +TF_DTYPE_2_MACE_DTYPE_MAP = { + tf.float32: mace_pb2.DT_FLOAT, + tf.double: mace_pb2.DT_DOUBLE, + tf.half: mace_pb2.DT_HALF, + tf.int64: mace_pb2.DT_INT64, + tf.int32: mace_pb2.DT_INT32, + tf.qint32: mace_pb2.DT_INT32, + tf.int16: mace_pb2.DT_INT16, + tf.qint16: mace_pb2.DT_INT16, + 
# --- residue: tail of mace/python/tools/convert_util.py (continuation of the
# TF dtype map and tf_dtype_2_mace_dtype); not reproducible standalone here
# without its head, which lies in the previous span. ---

# mace/python/tools/converter.py
import argparse
import sys
import hashlib
import os.path

# ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb --output quantized_test_dsp.pb --runtime dsp --input_dim input_node,1,28,28,3

FLAGS = None


def file_checksum(fname):
    """Return the SHA-256 hex digest of the file at `fname`, read in 4 KiB chunks."""
    hash_func = hashlib.sha256()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_func.update(chunk)
    return hash_func.hexdigest()


def main(unused_args):
    """Validate inputs, dispatch to the platform-specific converter and emit output.

    Exits with -1 on missing files, checksum mismatches or unsupported
    platform/runtime combinations.
    """
    if not os.path.isfile(FLAGS.model_file):
        print("Input graph file '" + FLAGS.model_file + "' does not exist!")
        sys.exit(-1)

    model_checksum = file_checksum(FLAGS.model_file)
    if FLAGS.model_checksum != "" and FLAGS.model_checksum != model_checksum:
        print("Model checksum mismatch: %s != %s" % (model_checksum, FLAGS.model_checksum))
        sys.exit(-1)

    if FLAGS.platform == 'caffe':
        if not os.path.isfile(FLAGS.weight_file):
            print("Input weight file '" + FLAGS.weight_file + "' does not exist!")
            sys.exit(-1)

        weight_checksum = file_checksum(FLAGS.weight_file)
        if FLAGS.weight_checksum != "" and FLAGS.weight_checksum != weight_checksum:
            print("Weight checksum mismatch: %s != %s" % (weight_checksum, FLAGS.weight_checksum))
            sys.exit(-1)

        if FLAGS.runtime == 'dsp':
            print("DSP not support caffe model yet.")
            sys.exit(-1)

        from mace.python.tools import caffe_converter_lib
        output_graph_def = caffe_converter_lib.convert_to_mace_pb(
            FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node,
            FLAGS.data_type, FLAGS.runtime, FLAGS.winograd)
    elif FLAGS.platform == 'tensorflow':
        if FLAGS.runtime == 'dsp':
            from mace.python.tools import tf_dsp_converter_lib
            output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
                FLAGS.model_file, FLAGS.input_node, FLAGS.output_node, FLAGS.dsp_mode)
        else:
            from mace.python.tools import tf_converter_lib
            output_graph_def = tf_converter_lib.convert_to_mace_pb(
                FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node,
                FLAGS.data_type, FLAGS.runtime, FLAGS.winograd)
    else:
        # fixed: an unknown platform previously fell through and crashed with
        # NameError on output_graph_def below
        print("Unsupported platform: %s" % FLAGS.platform)
        sys.exit(-1)

    if FLAGS.output_type == 'source':
        # lazy import, consistent with the converter-lib imports above
        from mace.python.tools import source_converter_lib
        source_converter_lib.convert_to_source(output_graph_def, model_checksum, FLAGS.template, FLAGS.obfuscate,
                                               FLAGS.model_tag, FLAGS.output, FLAGS.runtime, FLAGS.embed_model_data)
    else:
        with open(FLAGS.output, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        with open(FLAGS.output + '_txt', "wb") as f:
            # output_graph_def.ClearField('tensors')
            f.write(str(output_graph_def))
    # fixed: Python-2-only print statement
    print("Model conversion is completed.")


def str2bool(v):
    """argparse type: accept common yes/no spellings (case-insensitive)."""
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def parse_args():
    """Parses command line arguments."""
    parser = argparse.ArgumentParser()
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument(
        "--model_file",
        type=str,
        default="",
        help="TensorFlow \'GraphDef\' file to load, Caffe prototxt file to load.")
    parser.add_argument(
        "--weight_file",
        type=str,
        default="",
        help="Caffe data file to load.")
    parser.add_argument(
        "--model_checksum",
        type=str,
        default="",
        help="Model file sha256 checksum")
    parser.add_argument(
        "--weight_checksum",
        type=str,
        default="",
        help="Weight file sha256 checksum")
    parser.add_argument(
        "--output",
        type=str,
        default="",
        help="File to save the output graph to.")
    parser.add_argument(
        "--runtime",
        type=str,
        default="cpu",
        help="Runtime: cpu/gpu/dsp")
    parser.add_argument(
        "--input_node",
        type=str,
        default="input_node",
        help="e.g., input_node")
    parser.add_argument(
        "--output_node",
        type=str,
        default="softmax",
        help="e.g., softmax")
    parser.add_argument(
        "--data_type",
        type=str,
        default='DT_FLOAT',
        help="e.g., DT_HALF/DT_FLOAT")
    parser.add_argument(
        "--output_type",
        type=str,
        default="pb",
        help="output type: source/pb")
    parser.add_argument(
        "--template",
        type=str,
        default="",
        help="template path")
    parser.add_argument(
        "--obfuscate",
        type=str2bool,
        nargs='?',
        # NOTE(review): with nargs='?' a bare `--obfuscate` yields const=False,
        # i.e. it does NOT enable obfuscation — confirm this is intended.
        const=False,
        default=False,
        help="obfuscate model names")
    parser.add_argument(
        "--model_tag",
        type=str,
        default="",
        help="model tag for generated function and namespace")
    parser.add_argument(
        "--winograd",
        type=str2bool,
        nargs='?',
        const=False,
        default=False,
        help="open winograd convolution or not")
    parser.add_argument(
        "--dsp_mode",
        type=int,
        default=0,
        help="dsp run mode, defalut=0")
    parser.add_argument(
        "--input_shape",
        type=str,
        default="",
        help="input shape.")
    parser.add_argument(
        "--platform",
        type=str,
        default="tensorflow",
        help="tensorflow/caffe")
    parser.add_argument(
        "--embed_model_data",
        type=str2bool,
        default=True,
        # fixed: help text was copy-pasted from --input_shape
        help="embed model data into the generated source.")
    return parser.parse_known_args()


if __name__ == '__main__':
    FLAGS, unparsed = parse_args()
    main(unused_args=[sys.argv[0]] + unparsed)
# --- residue: diff header for mace/python/tools/dsp_ops.py (new file). ---


class DspOps(object):
    """Lookup table mapping TensorFlow op type names to Hexagon DSP nn-lib op names."""

    def __init__(self):
        # tf op type -> DSP nn op name.
        self.dsp_ops = {
            # fixed: original value was 'INPUT"' with a stray trailing double quote
            'INPUT': 'INPUT',
            'OUTPUT': 'OUTPUT',
            'NoOp': 'Nop',
            'FLATTEN': 'Flatten',
            'Identity': 'Nop',
            'Placeholder': 'INPUT',
            'Const': 'Const',
            'QuantizedConv2D': 'QuantizedConv2d_8x8to32',
            'QuantizedMatMul': 'QuantizedMatMul_8x8to32',
            'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8',
            'QuantizedRelu': 'QuantizedRelu_8',
            'QuantizedReluX': 'QuantizedReluX_8',
            'QuantizedMaxPool': 'QuantizedMaxPool_8',
            'QuantizedAvgPool': 'QuantizedAvgPool_8',
            'QuantizedConcat': 'QuantizedConcat_8',
            'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32',
            'QuantizedResizeBilinear': 'QuantizedResizeBilinear_8',
            'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8',
            'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8',
            'QuantizedSoftmax': 'QuantizedSoftmax_8',
            'Min': 'Min_f',
            'Max': 'Max_f',
            'QuantizeV2': 'Quantize',
            'Dequantize': 'Dequantize',
            'Softmax': 'Softmax_f',
            'Reshape': 'Reshape',
            'QuantizedReshape': 'QuantizedReshape',
            'Sigmoid': 'Sigmoid_f',
            'Slice': 'Slice_f',
            'Add': 'Add_f',
            'Mul': 'Mul_f',
            'Requantize': 'Requantize_32to8',
            'RequantizationRange': 'RequantizationRange_32',
            'Sub': 'Sub_f',
            'Pack': 'Pack_int32',
            'StridedSlice': 'StridedSlice_f',
            'ExpandDims': 'ExpandDims_f',
            'QuantizedMul': 'QuantizedMul_8x8to32',
            'QuantizedAdd': 'QuantizedAdd_8p8to32',
            'Pad': 'Pad_f',
            'SpaceToBatchND': 'SpaceToBatchND_f',
            'BatchToSpaceND': 'BatchToSpaceND_f',
            'ResizeBilinear': 'ResizeBilinear_f',
            'ConcatV2': 'ConcatV2_f',
            'Conv2DBackpropInput': 'Deconv_f',
            'Tanh': 'Tanh_f',
            'Split': 'Split_f',
            'Transpose': 'Transpose_f',
            'Concat': 'Concat_f',
            'AddN': 'AddN_f',
        }

    def has_op(self, tf_op):
        """Return True when `tf_op` has a DSP counterpart."""
        return tf_op in self.dsp_ops

    def map_nn_op(self, tf_op):
        """Return the DSP nn op name for `tf_op`; raise for unsupported ops."""
        if tf_op not in self.dsp_ops:
            # fixed: original passed two args to Exception, producing a tuple message
            raise Exception('Could not map nn op for: %s' % tf_op)
        return self.dsp_ops[tf_op]
# --- residue: tail of DspOps.map_nn_op (raise/return) — the method begins in
# the previous span and is left to that span. ---

# mace/python/tools/graph_util.py
from collections import OrderedDict


def _topo_sort_node(node, nodes_map, ordered_nodes_map):
    """Insert `node` into `ordered_nodes_map` after all of its producers (DFS).

    Input names of the form 'name:port' are reduced to 'name'; inputs that are
    not in `nodes_map` are skipped. NOTE(review): recursion depth equals the
    longest producer chain — very deep graphs could hit the recursion limit.
    """
    if node.name in ordered_nodes_map:
        return
    for input_tensor_name in node.input:
        input_node_name = input_tensor_name.split(':')[
            0] if ':' in input_tensor_name else input_tensor_name
        if input_node_name not in nodes_map or input_node_name in ordered_nodes_map:
            continue
        _topo_sort_node(nodes_map[input_node_name], nodes_map, ordered_nodes_map)
    ordered_nodes_map[node.name] = node


# fixed: sort_tf_node and sort_mace_node were byte-identical duplicate
# implementations; both now delegate to the shared helper above.
def sort_tf_node(node, nodes_map, ordered_nodes_map):
    """Topologically order a TensorFlow NodeDef and its producers."""
    _topo_sort_node(node, nodes_map, ordered_nodes_map)


def sort_tf_graph(graph_def):
    """Return a tf.GraphDef whose nodes are topologically ordered."""
    import tensorflow as tf  # lazy: only needed when actually sorting a TF graph

    nodes_map = {}
    ordered_nodes_map = OrderedDict()
    for node in graph_def.node:
        nodes_map[node.name] = node
    for node in graph_def.node:
        sort_tf_node(node, nodes_map, ordered_nodes_map)
    sorted_graph = tf.GraphDef()
    sorted_graph.node.extend([node for node in ordered_nodes_map.values()])
    return sorted_graph


def sort_mace_node(node, nodes_map, ordered_nodes_map):
    """Topologically order a mace OperatorDef and its producers."""
    _topo_sort_node(node, nodes_map, ordered_nodes_map)


def sort_mace_graph(graph_def, output_name):
    """Topologically order the ops of a mace NetDef reachable from `output_name`."""
    nodes_map = {}
    ordered_nodes_map = OrderedDict()
    for node in graph_def.op:
        nodes_map[node.name] = node
    sort_mace_node(nodes_map[output_name], nodes_map, ordered_nodes_map)
    # NOTE: the NetDef assembly tail of this function lies in the next span.
# --- residue: tail of graph_util.sort_mace_graph (NetDef assembly via
# mace_pb2) — the function begins in the previous span; left to that span. ---

# mace/python/tools/memory_optimizer.py
import sys
import operator
# fixed: `reduce` is not a builtin on Python 3; functools.reduce works on 2 and 3.
# (the unused `from mace.proto import mace_pb2` import was dropped)
from functools import reduce


class MemoryOptimizer(object):
    """Greedily reuse output buffers across ops of a mace NetDef.

    Builds a reference count per output tensor, then walks the ops in order,
    handing each output either a freed (idle) memory block or a fresh one and
    growing each block to the maximum 2-D size ever requested for it.
    BufferToImage/ImageToBuffer ops are excluded from planning.
    """

    def __init__(self, net_def):
        self.net_def = net_def
        self.idle_mem = set()          # mem ids currently free for reuse
        self.op_mem = {}               # op output tensor name -> mem_id
        self.mem_block = {}            # mem_id -> [x, y] block extents
        self.total_mem_count = 0
        self.ref_counter = {}          # tensor name -> remaining consumer count

        consumers = {}
        for op in net_def.op:
            if self.is_buffer_image_op(op):
                continue
            for ipt in op.input:
                if ipt not in consumers:
                    consumers[ipt] = []
                consumers[ipt].append(op)
        # only ref op's output tensor
        for op in net_def.op:
            if self.is_buffer_image_op(op):
                continue
            for output in op.output:
                tensor_name = output
                if tensor_name in consumers:
                    self.ref_counter[tensor_name] = len(consumers[tensor_name])
                else:
                    self.ref_counter[tensor_name] = 0

    def is_buffer_image_op(self, op):
        """True for the buffer<->image transform ops that are not planned."""
        return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'

    def get_mem_size(self, op_type, output_shape):
        """Return the [x, y] block extents needed for one output of `op_type`.

        WinogradTransform/GEMM outputs are laid out differently from the
        default (channel-packed-by-4) layout.
        """
        mem_size = [0, 0]
        if op_type == 'WinogradTransform' or op_type == 'GEMM':
            mem_size[0] = output_shape[2] * output_shape[3]
            mem_size[1] = output_shape[0] * ((output_shape[1] + 3) // 4)
        else:
            mem_size[0] = output_shape[2] * ((output_shape[3] + 3) // 4)
            mem_size[1] = output_shape[0] * output_shape[1]
        return mem_size

    def optimize(self):
        """Assign mem ids to every plannable op output and emit the mem arena."""
        for op in self.net_def.op:
            if self.is_buffer_image_op(op):
                continue
            if not op.output_shape:
                print('WARNING: There is no output shape information to do memory optimization.')
                return
            if len(op.output_shape) != len(op.output):
                print('WARNING: the number of output shape is not equal to the number of output.')
                return
            for i in range(len(op.output)):
                if len(self.idle_mem) == 0:
                    # allocate new mem
                    mem_id = self.total_mem_count
                    self.total_mem_count += 1
                else:
                    # reuse mem
                    mem_id = self.idle_mem.pop()

                op.mem_id.extend([mem_id])
                self.op_mem[op.output[i]] = mem_id
                if mem_id not in self.mem_block:
                    self.mem_block[mem_id] = [0, 0]
                mem_size = self.mem_block[mem_id]
                op_mem_size = self.get_mem_size(op.type, op.output_shape[i].dims)
                mem_size[0] = max(mem_size[0], op_mem_size[0])
                mem_size[1] = max(mem_size[1], op_mem_size[1])

            # de-ref input tensor mem
            for ipt in op.input:
                if ipt in self.ref_counter:
                    self.ref_counter[ipt] -= 1
                    if self.ref_counter[ipt] == 0:
                        self.idle_mem.add(self.op_mem[ipt])
                    elif self.ref_counter[ipt] < 0:
                        raise Exception('ref count is less than 0')

        for mem in self.mem_block:
            arena = self.net_def.mem_arena
            block = arena.mem_block.add()
            block.mem_id = mem
            block.x = self.mem_block[mem][0]
            block.y = self.mem_block[mem][1]

        # fixed: these prints passed the values as extra print arguments
        # (printing a tuple) instead of %-formatting them
        print('total op: %d' % len(self.net_def.op))
        origin_mem_size = 0
        optimized_mem_size = 0
        for op in self.net_def.op:
            if self.is_buffer_image_op(op):
                continue
            origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
        for mem in self.mem_block:
            # fixed: Python-2-only print statement
            print(mem, self.mem_block[mem])
            optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)

        print('origin mem: %d, optimized mem: %d' % (origin_mem_size, optimized_mem_size))


def optimize_memory(net_def):
    """Convenience wrapper: run MemoryOptimizer on `net_def` in place."""
    mem_optimizer = MemoryOptimizer(net_def)
    mem_optimizer.optimize()

# --- residue: diff header and head of mace/python/tools/model.template
# (generated C++/jinja); that template is handled in later spans. ---
+// + +#include +#include + +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace {{tag}} { + +{% for tensor in tensors %} +extern void CreateTensor{{ tensor.id }}(std::vector &tensors, + const unsigned char *model_data); +{% endfor %} + + +{% for i in range(net.op|length) %} +extern void CreateOperator{{i}}(mace::OperatorDef &op); +{% endfor %} + +} // namespace {{ tag }} + +namespace { + +{% if net.arg|length != 0 %} +void CreateNetArg(mace::NetDef &net_def) { + net_def.mutable_arg().reserve({{ net.arg|length }}); + mace::Argument *arg = nullptr; + {% for arg in net.arg %} + + arg = net_def.add_arg(); + arg->set_name({{ arg.name|tojson }}); + + {%- if arg.HasField('f') %} + arg->set_f({{ arg.f }}); + {% endif %} + + {%- if arg.HasField('i') %} + arg->set_i({{ arg.i }}); + {% endif %} + + {%- if arg.HasField('s') %} + arg->set_s({{ arg.s|tojson }}); + {% endif %} + + {% if arg.floats|length != 0 %} + arg->set_floats({ {{ arg.floats|join(', ') }} }); + {% endif %} + {% if arg.ints|length != 0 %} + arg->set_ints({ {{ arg.ints|join(', ') }} }); + {% endif %} + {% if arg.strings|length != 0 %} + arg->set_strings({ {{ arg.strings|stringfy() }} }); + {% endif %} + + {% endfor %} +} +{% endif %} + +{% if net.output_info | length > 0 %} +void CreateOutputInfo(mace::NetDef &net_def) { + std::vector> dims { {{net.output_info | map(attribute='dims') | join(', ') | replace('[', '{') | replace(']', '}') }} }; + + std::vector data_types_int { {{ net.output_info | map(attribute='data_type') | join(', ') }} }; + std::vector data_types({{ net.output_info | length }}); + for (int k = 0; k < {{ net.output_info | length }}; ++k) { + data_types[k] = static_cast(data_types_int[k]); + } + net_def.mutable_output_info().resize({{ net.output_info | length }}); + for (int i = 0; i < {{ net.output_info | length }}; ++i) { + net_def.mutable_output_info()[i].set_data_type(data_types[i]); + 
net_def.mutable_output_info()[i].set_dims(dims[i]); + } +} +{% endif %} + +void CreateOperators(std::vector &ops) { + MACE_LATENCY_LOGGER(1, "Create operators"); + + ops.resize({{ net.op|length }}); + {% for i in range(net.op|length) %} + + mace::{{tag}}::CreateOperator{{i}}(ops[{{i}}]); + {% endfor %} +} + +void CreateTensors(std::vector &tensors, + const unsigned char *model_data) { + MACE_LATENCY_LOGGER(1, "Create tensors"); + tensors.reserve({{ net.tensors|length }}); + + {% for tensor in tensors %} + + mace::{{tag}}::CreateTensor{{tensor.id}}(tensors, model_data); + {% endfor %} +} + + +{% if net.mem_arena.mem_block|length != 0 %} +void CreateMemoryArena(mace::MemoryArena &mem_arena) { + std::vector &mem_block = mem_arena.mutable_mem_block(); + mem_block.reserve({{ net.mem_arena.mem_block|length }}); + + {% for mem_blk in net.mem_arena.mem_block %} + mem_block.emplace_back(mace::MemoryBlock({{ mem_blk.mem_id }}, + {{mem_blk.x}}, + {{mem_blk.y}})); + {% endfor %} + +} +{% endif %} + +} // namespace + +namespace {{tag}} { + +NetDef CreateNet(const unsigned char *model_data) { + MACE_LATENCY_LOGGER(1, "Create net {{ net.name }}"); + NetDef net_def; + net_def.set_name("{{ net.name}}"); + net_def.set_version("{{ net.version }}"); + + {% if net.arg|length != 0 %} + CreateNetArg(net_def); + {% endif %} + + CreateOperators(net_def.mutable_op()); + + CreateTensors(net_def.mutable_tensors(), model_data); + + {% if net.mem_arena.mem_block|length != 0 %} + CreateMemoryArena(net_def.mutable_mem_arena()); + {% endif %} + + {% if net.output_info | length > 0 %} + CreateOutputInfo(net_def); + {% endif %} + + return net_def; +} + +const std::string ModelChecksum() { + return {{ model_pb_checksum|tojson }}; +} + +} // namespace {{tag}} +} // namespace mace diff --git a/mace/python/tools/model_header.template b/mace/python/tools/model_header.template new file mode 100644 index 0000000000000000000000000000000000000000..9f5c776d52bd6456bf3c410216f5b4de1ce1fa58 --- /dev/null +++ 
# --- residue: mace/python/tools/model_header.template (generated C++ header;
# its '#include' targets were stripped in this residue) and the diff header
# for mace/python/tools/opencl_codegen.py. ---

# mace/python/tools/opencl_codegen.py
import argparse
import os
import sys

import numpy as np

# python mace/python/tools/opencl_codegen.py \
#     --cl_binary_dirs=${CL_BIN_DIR} --output_path=${CL_HEADER_PATH}

FLAGS = None


def generate_cpp_source():
    """Collect compiled OpenCL .bin files and render them into a C++ map source.

    Reads every '*.bin' under FLAGS.cl_binary_dirs and renders the
    'str2vec_maps.cc.tmpl' jinja template with kernel-name -> byte list.
    """
    import jinja2  # lazy: only required when rendering

    maps = {}
    for cl_binary_dir in FLAGS.cl_binary_dirs.split(","):
        if not os.path.exists(cl_binary_dir):
            print("Input cl_binary_dir " + cl_binary_dir + " doesn't exist!")
            # fixed: original fell through into os.listdir() and crashed
            continue
        for file_name in os.listdir(cl_binary_dir):
            file_path = os.path.join(cl_binary_dir, file_name)
            if file_path[-4:] == ".bin":
                # read binary; fixed: close the handle deterministically
                with open(file_path, "rb") as f:
                    binary_array = np.fromfile(f, dtype=np.uint8)
                maps[file_name[:-4]] = [hex(ele) for ele in binary_array]

    env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
    return env.get_template('str2vec_maps.cc.tmpl').render(
        maps=maps,
        data_type='unsigned char',
        variable_name='kCompiledProgramMap'
    )


def main(unused_args):
    """Generate the C++ source embedding compiled CL binaries at FLAGS.output_path."""
    cpp_cl_binary_source = generate_cpp_source()
    if os.path.isfile(FLAGS.output_path):
        os.remove(FLAGS.output_path)
    # fixed: use a context manager instead of manual open/close
    with open(FLAGS.output_path, "w") as w_file:
        w_file.write(cpp_cl_binary_source)


def parse_args():
    """Parses command line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cl_binary_dirs",
        type=str,
        default="cl_bin0/,cl_bin1/,cl_bin2/",
        help="The cl binaries directories.")
    parser.add_argument(
        "--output_path",
        type=str,
        default="./mace/examples/codegen/opencl/opencl_compiled_program.cc",
        help="The path of generated C++ header file which contains cl binaries.")
    return parser.parse_known_args()


if __name__ == '__main__':
    FLAGS, unparsed = parse_args()
    main(unused_args=[sys.argv[0]] + unparsed)

# --- residue: diff header and head of mace/python/tools/operator.template
# (generated C++/jinja); that template continues in later spans. ---
+// + +#include +#include + +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace { + +void UpdateOp(mace::OperatorDef &op, + const std::string &name, + const std::string &type, + const std::vector &inputs, + const std::vector &outputs, + const std::vector &output_types, + uint32_t node_id, + const std::vector &mem_ids) { + op.set_name(name); + op.set_type(type); + op.set_input(inputs); + op.set_output(outputs); + op.set_output_type(output_types); + op.set_node_id(node_id); + op.set_mem_id(mem_ids); +} + +} // namespace +} // namespace mace + +namespace mace { +namespace {{tag}} { + +{% for i in range(start, end) %} + +void CreateOperator{{i}}(mace::OperatorDef &op) { + MACE_LATENCY_LOGGER(2, "Create operator {{ net.op[i].name }}"); + + mace::Argument *arg = nullptr; + {% for arg in net.op[i].arg %} + + arg = op.add_arg(); + arg->set_name({{ arg.name|tojson }}); + + {%- if arg.HasField('f') %} + arg->set_f({{ arg.f }}); + {%- endif %} + {%- if arg.HasField('i') %} + arg->set_i({{ arg.i }}); + {%- endif %} + {%- if arg.HasField('s') %} + arg->set_s({{ arg.s|tojson }}); + {%- endif %} + + {% if arg.floats|length != 0 %} + arg->set_floats({ {{ arg.floats|join(', ') }} }); + {% endif %} + {% if arg.ints|length != 0 %} + arg->set_ints({ {{ arg.ints|join(', ') }} }); + {% endif %} + {% if arg.strings|length != 0 %} + arg->set_strings({ {{ arg.strings|stringfy() }} }); + {% endif %} + {% endfor %} + + {% for shape in net.op[i].output_shape %} + {% if shape.dims | length > 0 %} + op.add_output_shape(mace::OutputShape({ {{ shape.dims|join(', ') }} })); + {% endif %} + {% endfor %} + + std::vector output_types_int({ {{ net.op[i].output_type | join(', ') }} }); + std::vector output_types({{ net.op[i].output_type | length }}); + for (int k = 0; k < {{ net.op[i].output_type | length }}; ++k) { + output_types[k] = static_cast(output_types_int[k]); + } + UpdateOp(op, {{ net.op[i].name|tojson }}, {{ 
net.op[i].type|tojson}}, + { {{ net.op[i].input|stringfy }} }, + { {{ net.op[i].output|stringfy }} }, + output_types, + {{ net.op[i].node_id }}, + { {{ net.op[i].mem_id | join(', ') }} }); + + {% if runtime == 'dsp' %} + op.set_padding({{ net.op[i].padding }}); + {% if net.op[i].node_input | length > 0 %} + std::vector input_node_ids({ {{ net.op[i].node_input | map(attribute='node_id') | join(', ') }} }); + std::vector input_output_ports({ {{ net.op[i].node_input | map(attribute='output_port') | join(', ')}} }); + + for (size_t i = 0; i < {{ net.op[i].node_input | length }}; ++i) { + mace::NodeInput input(input_node_ids[i], input_output_ports[i]); + op.add_node_input(input); + } + {% endif %} + {% if net.op[i].out_max_byte_size | length > 0 %} + std::vector out_max_byte_sizes {{ net.op[i].out_max_byte_size | replace('[', '{') | replace(']', '}') }}; + for (size_t i = 0; i < {{ net.op[i].out_max_byte_size | length }}; ++i) { + op.add_out_max_byte_size(out_max_byte_sizes[i]); + } + {% endif %} + {% endif %} +} + +{% endfor %} + +} // namespace {{tag}} +} // namespace mace + diff --git a/mace/python/tools/source_converter_lib.py b/mace/python/tools/source_converter_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..52fc6126210355f4aa177319c46fe0f476680d86 --- /dev/null +++ b/mace/python/tools/source_converter_lib.py @@ -0,0 +1,187 @@ +import os +import uuid +import numpy as np +import hashlib + +from mace.proto import mace_pb2 +from jinja2 import Environment, FileSystemLoader + + +GENERATED_NAME = set() + +def generate_obfuscated_name(namespace, name): + md5 = hashlib.md5() + md5.update(namespace) + md5.update(name) + md5_digest = md5.hexdigest() + + name = md5_digest[:8] + while name in GENERATED_NAME: + name = md5_digest + assert name not in GENERATED_NAME + GENERATED_NAME.add(name) + return name + +def generate_tensor_map(tensors): + tensor_map = {} + for t in tensors: + if not tensor_map.has_key(t.name): + tensor_map[t.name] = 
generate_obfuscated_name("tensor", t.name) + return tensor_map + +def generate_in_out_map(ops, tensor_map): + in_out_map = {} + for op in ops: + op.name = generate_obfuscated_name("op", op.name) + for input_name in op.input: + if not in_out_map.has_key(input_name): + if tensor_map.has_key(input_name): + in_out_map[input_name] = tensor_map[input_name] + else: + in_out_map[input_name] = generate_obfuscated_name("in", input_name) + for output_name in op.output: + if not in_out_map.has_key(output_name): + if tensor_map.has_key(output_name): + in_out_map[output_name] = tensor_map[output_name] + else: + in_out_map[output_name] = generate_obfuscated_name("out", output_name) + return in_out_map + +def obfuscate_name(net_def): + input_node = "mace_input_node" + output_node = "mace_output_node" + tensor_map = generate_tensor_map(net_def.tensors) + in_out_map = generate_in_out_map(net_def.op, tensor_map) + for t in net_def.tensors: + if input_node not in t.name and output_node not in t.name: + t.name = tensor_map[t.name] + for op in net_def.op: + for i in range(len(op.input)): + if input_node not in op.input[i]: + op.input[i] = in_out_map[op.input[i]] + for i in range(len(op.output)): + if output_node not in op.output[i]: + op.output[i] = in_out_map[op.output[i]] + +def rename_tensor(net_def): + tensor_map = {} + for t in net_def.tensors: + if not tensor_map.has_key(t.name): + tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_") + t.name = tensor_map[t.name] + for op in net_def.op: + for i in range(len(op.input)): + if tensor_map.has_key(op.input[i]): + op.input[i] = tensor_map[op.input[i]] + for i in range(len(op.output)): + if tensor_map.has_key(op.output[i]): + op.output[i] = tensor_map[op.output[i]] + +class TensorInfo: + def __init__(self, id, t, runtime): + self.id = id + self.data_type = mace_pb2.DataType.Name(t.data_type) + if t.data_type == mace_pb2.DT_FLOAT: + if runtime == 'gpu': + self.data_type = mace_pb2.DT_HALF + self.data = 
bytearray(np.array(t.float_data).astype(np.float16).tobytes())
      else:
        self.data_type = mace_pb2.DT_FLOAT
        self.data = bytearray(np.array(t.float_data).astype(np.float32).tobytes())
    elif t.data_type == mace_pb2.DT_INT32:
      self.data = bytearray(np.array(t.int32_data).astype(np.int32).tobytes())
    elif t.data_type == mace_pb2.DT_UINT8:
      # uint8 tensor values are carried in the proto's int32_data field.
      self.data = bytearray(np.array(t.int32_data).astype(np.uint8).tolist())


# Render an iterable of values as a comma-separated list of quoted strings,
# e.g. ['a', 'b'] -> '"a", "b"'.  Registered below as the Jinja2 filter
# 'stringfy' used by the code-generation templates.
def stringfy(value):
  return ', '.join('"{0}"'.format(w) for w in value)


# Generate the C++ sources for a converted model.
#
# Args:
#   net_def: mace_pb2.NetDef to emit (mutated in place by renaming below).
#   mode_pb_checksum: checksum string passed through to the model template.
#   template_dir: directory containing the Jinja2 *.template files.
#   obfuscate: if true, obfuscate all names; otherwise just sanitize them.
#   model_tag: C++ namespace / file tag for the generated model.
#   output: path of the main generated .cc file; siblings go next to it.
#   runtime: target runtime ('gpu' selects fp16 weight storage).
#   embed_model_data: if true, weights are embedded in the source; otherwise
#     they are written to a separate <model_tag>.data file.
def convert_to_source(net_def, mode_pb_checksum, template_dir, obfuscate, model_tag, output, runtime, embed_model_data):
  if obfuscate:
    obfuscate_name(net_def)
  else:
    rename_tensor(net_def)

  # Capture our current directory
  print template_dir

  # Create the jinja2 environment.
  j2_env = Environment(loader=FileSystemLoader(template_dir), trim_blocks=True)
  j2_env.filters['stringfy'] = stringfy
  output_dir = os.path.dirname(output) + '/'
  # generate tensor source files
  template_name = 'tensor_source.template'
  model_data = []
  offset = 0
  counter = 0
  for t in net_def.tensors:
    tensor_info = TensorInfo(counter, t, runtime)
    # align
    # Keep every non-uint8 tensor 4-byte aligned inside the packed blob; the
    # zero padding is appended to model_data and counted into offset.
    if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0:
      padding = 4 - offset % 4
      model_data.extend(bytearray([0] * padding))
      offset += padding
    # One tensorN.cc per tensor; the template records the tensor's offset
    # into the shared model data blob.
    source = j2_env.get_template(template_name).render(
      tensor_info = tensor_info,
      tensor = t,
      tag = model_tag,
      runtime = runtime,
      offset = offset,
    )
    model_data.extend(tensor_info.data)
    offset += len(tensor_info.data)
    with open(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f:
      f.write(source)
    counter += 1

  # generate tensor data
  template_name = 'tensor_data.template'
  source = j2_env.get_template(template_name).render(
    tag = model_tag,
    embed_model_data = embed_model_data,
    model_data_size = offset,
    model_data = model_data
  )
  with open(output_dir + 'tensor_data' + '.cc', "wb") as f:
    f.write(source)
  if not embed_model_data:
    # Weights are shipped beside the source as <model_tag>.data instead of
    # being embedded in the generated .cc file.
    f = 
open(output_dir + model_tag + '.data', "wb") + f.write(bytearray(model_data)) + f.close() + + # generate op source files + template_name = 'operator.template' + counter = 0 + op_size = len(net_def.op) + for start in range(0, op_size, 10): + source = j2_env.get_template(template_name).render( + start = start, + end = min(start+10, op_size), + net = net_def, + tag = model_tag, + runtime = runtime, + ) + with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f: + f.write(source) + counter += 1 + + # generate model source files + template_name = 'model.template' + tensors = [TensorInfo(i, net_def.tensors[i], runtime) for i in range(len(net_def.tensors))] + source = j2_env.get_template(template_name).render( + tensors = tensors, + net = net_def, + tag = model_tag, + runtime = runtime, + model_pb_checksum = mode_pb_checksum + ) + with open(output, "wb") as f: + f.write(source) + + # generate model header file + template_name = 'model_header.template' + source = j2_env.get_template(template_name).render( + tag = model_tag, + ) + with open(output_dir + model_tag + '.h', "wb") as f: + f.write(source) diff --git a/mace/python/tools/tensor_data.template b/mace/python/tools/tensor_data.template new file mode 100644 index 0000000000000000000000000000000000000000..d009b101164e32cff237058239db7d902aca12e5 --- /dev/null +++ b/mace/python/tools/tensor_data.template @@ -0,0 +1,64 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// Generated by the mace converter. DO NOT EDIT! 
+// + +#include +#include + +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +{% if not embed_model_data %} + +#include +#include +#include +#include +#include + +{% endif %} + +namespace mace { +namespace {{tag}} { + +{% if embed_model_data %} +alignas(4) const unsigned char model_data[{{ model_data_size }}] = { +{% for d in model_data %}{{"0x%02X, " % d }}{%endfor%} +}; +{% endif %} + +const unsigned char *LoadModelData(const char *model_data_file) { +{% if embed_model_data %} + return model_data; +{% else %} + int fd = open(model_data_file, O_RDONLY); + MACE_CHECK(fd >= 0, "Failed to open model data file ", + model_data_file, ", error code: ", errno); + + const unsigned char *model_data = + static_cast(mmap(nullptr, {{ model_data_size }}, + PROT_READ, MAP_PRIVATE, fd, 0)); + MACE_CHECK(model_data != MAP_FAILED, "Failed to map model data file ", + model_data_file, ", error code: ", errno); + + int ret = close(fd); + MACE_CHECK(ret == 0, "Failed to close model data file ", + model_data_file, ", error code: ", errno); + + return model_data; +{% endif %} +} + +void UnloadModelData(const unsigned char *model_data) { +{% if not embed_model_data %} + int ret = munmap(const_cast(model_data), + {{ model_data_size }}); + MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ", errno); +{% endif %} +} + +} // namespace {{tag}} +} // namespace mace + diff --git a/mace/python/tools/tensor_source.template b/mace/python/tools/tensor_source.template new file mode 100644 index 0000000000000000000000000000000000000000..c321112ed08fa74356f0b64abeb6887ea3025542 --- /dev/null +++ b/mace/python/tools/tensor_source.template @@ -0,0 +1,26 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// Generated by the mace converter. DO NOT EDIT! 
+// + +#include +#include + +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace {{tag}} { + +void CreateTensor{{tensor_info.id}}(std::vector &tensors, + const unsigned char *model_data) { + MACE_LATENCY_LOGGER(2, "Create tensor {{ tensor.name }}"); + tensors.emplace_back(mace::ConstTensor( + {{ tensor.name|tojson }}, model_data + {{ offset }}, + { {{ tensor.dims|join(', ') }} }, {{ tensor_info.data_type }}, {{ tensor.node_id }})); +} + +} // namespace {{tag}} +} // namespace mace + diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..5b488b1e526ee2689d26b9678d9ac72b62564d8c --- /dev/null +++ b/mace/python/tools/tf_converter_lib.py @@ -0,0 +1,1047 @@ +from mace.proto import mace_pb2 +import tensorflow as tf +import numpy as np +import math +import copy +from tensorflow import gfile +from mace.python.tools import memory_optimizer +from tensorflow.core.framework import graph_pb2 +from tensorflow.core.framework import tensor_shape_pb2 + +# TODO: support NCHW formt, now only support NHWC. 
+padding_mode = { + 'VALID': 0, + 'SAME': 1, + 'FULL': 2 +} +pooling_type_mode = { + 'AvgPool': 1, + 'MaxPool': 2 +} + +buffer_type_map = { + 'CONV2D_FILTER' : 0, + 'IN_OUT_CHANNEL' : 1, + 'ARGUMENT' : 2, + 'IN_OUT_HEIGHT' : 3, + 'IN_OUT_WIDTH' : 4, + 'WINOGRAD_FILTER' : 5, + 'DW_CONV2D_FILTER' : 6, +} + +data_type_map = { + 'DT_HALF' : mace_pb2.DT_HALF, + 'DT_FLOAT': mace_pb2.DT_FLOAT +} + +activation_name_map = { + 'Relu' : 'RELU', + 'Sigmoid' : 'SIGMOID', + 'Tanh' : 'TANH', + 'Relu6' : 'RELUX' +} + +BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"] + +MACE_INPUT_NODE_NAME = "mace_input_node" +MACE_OUTPUT_NODE_NAME = "mace_output_node" + +OPENCL_IMAGE_MAX_SIZE = 16384 + +def get_input_tensor(op, index): + input_tensor = op.inputs[index] + if input_tensor.op.type == 'Reshape': + input_tensor = get_input_tensor(input_tensor.op, 0) + return input_tensor + +class TFConverter(object): + def __init__(self, tf_ops, net_def, dt, device, winograd): + self.net_def = net_def + self.tf_ops = tf_ops + self.dt = dt + self.device = device + self.winograd = winograd + self.tf_graph = {} + self.tf_parents = {} + self.resolved_ops = {} + self.unused_tensor = set() + self.transpose_filter_tensor = {} + self.reshape_tensor = {} + self.ops = {} + + for op in tf_ops: + self.ops[op.name] = op + + for op in tf_ops: + self.resolved_ops[op.name] = 0 + for input in op.inputs: + input_name = input.name[:-2] + if input_name not in self.tf_graph: + self.tf_graph[input_name] = [] + self.tf_graph[input_name].append(op) + if op.name not in self.tf_parents: + self.tf_parents[op.name] = [] + self.tf_parents[op.name].append(self.ops[input_name]) + + def add_buffer_to_image(self, input_name, input_type): + output_name = input_name[:-2] + "_b2i" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'BufferToImage' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 
'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'mode' + arg.i = 0 + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_image_to_buffer(self, input_name, input_type): + output_name = input_name[:-2] + "_i2b" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_input_transform(self, names, is_single): + for name in names: + if is_single: + new_input_name = MACE_INPUT_NODE_NAME + ":0" + else: + new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'BufferToImage' + op_def.input.extend([new_input_name]) + op_def.output.extend([name+':0']) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + def add_output_transform(self, names, is_single): + for name in names: + if is_single: + output_name = MACE_OUTPUT_NODE_NAME + ":0" + else: + output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([name+':0']) + op_def.output.extend([output_name]) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + @staticmethod + def add_output_shape(outputs, op): + output_shapes = [] + for output in outputs: + if output.shape.num_elements() is not None: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + 
op.output_shape.extend(output_shapes) + + def add_tensor(self, name, shape, tf_dt, value): + tensor = self.net_def.tensors.add() + tensor.name = name + + shape = list(shape) + tensor.dims.extend(shape) + + if tf_dt == tf.float32: + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(value.flat) + elif tf_dt == tf.int32: + tensor.data_type = mace_pb2.DT_INT32 + tensor.int32_data.extend(value.flat) + else: + raise Exception("Not supported tensor type: " + tf_dt.name) + + def convert_reshape(self, op): + input_tensor = get_input_tensor(op, 0) + shape_tensor = get_input_tensor(op, 1) + shape_value = shape_tensor.eval().astype(np.int32) + self.unused_tensor.add(shape_tensor.name) + self.reshape_tensor[input_tensor.name] = shape_value + self.resolved_ops[op.name] = 1 + + def convert_tensor(self, op): + output_name = op.outputs[0].name + if output_name not in self.unused_tensor: + tensor = self.net_def.tensors.add() + tf_tensor = op.outputs[0].eval() + if output_name in self.transpose_filter_tensor: + tf_tensor = tf_tensor.transpose(self.transpose_filter_tensor[output_name]) + if output_name in self.reshape_tensor: + tf_tensor = tf_tensor.reshape(self.reshape_tensor[output_name]) + tensor.name = op.outputs[0].name + + shape = list(tf_tensor.shape) + tensor.dims.extend(shape) + + tf_dt = op.get_attr('dtype') + if tf_dt == tf.float32: + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(tf_tensor.astype(np.float32).flat) + elif tf_dt == tf.int32: + tensor.data_type = mace_pb2.DT_INT32 + tensor.int32_data.extend(tf_tensor.astype(np.int32).flat) + else: + raise Exception("Not supported tensor type: " + tf_dt.name) + self.resolved_ops[op.name] = 1 + + def check_winograd_conv(self, op): + filter_shape = get_input_tensor(op, 1).shape.as_list() + strides = op.get_attr('strides')[1:3] + output_shape = op.outputs[0].shape.as_list() + if len(output_shape) == 0 or output_shape[0] is None: + return False + width = output_shape[0] * ((output_shape[1] + 1)/2) * 
((output_shape[2]+1)/2) + return self.winograd and op.type != 'DepthwiseConv2dNative' and self.device == 'gpu' and \ + filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \ + (strides[0] == 1) and (strides[0] == strides[1]) and \ + (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ + (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ + (width < OPENCL_IMAGE_MAX_SIZE) + + def convert_winograd_conv(self, op): + filter_tensor = get_input_tensor(op, 1) + filter_shape = filter_tensor.shape.as_list() + output_shape = op.outputs[0].shape.as_list() + + self.transpose_filter_tensor[filter_tensor.name] = (3, 2, 0, 1) + filter_name = self.add_buffer_to_image(op.inputs[1].name, "WINOGRAD_FILTER") + + # Input transform + wt_op = mace_pb2.OperatorDef() + arg = wt_op.arg.add() + arg.name = 'T' + arg.i = self.dt + padding_arg = wt_op.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode[op.get_attr('padding')] + wt_op.name = op.name + '_input_transform' + wt_op.type = 'WinogradTransform' + wt_op.input.extend([op.inputs[0].name]) + wt_output_name = wt_op.name + ":0" + wt_op.output.extend([wt_output_name]) + wt_output_shape = mace_pb2.OutputShape() + wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) + wt_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1]) + wt_op.output_shape.extend([wt_output_shape]) + + # MatMul + matmul_op = mace_pb2.OperatorDef() + arg = matmul_op.arg.add() + arg.name = 'T' + arg.i = self.dt + matmul_op.name = op.name + '_matmul' + matmul_op.type = 'MatMul' + matmul_op.input.extend([filter_name, wt_output_name]) + matmul_output_name = matmul_op.name + ":0" + matmul_op.output.extend([matmul_output_name]) + matmul_output_shape = mace_pb2.OutputShape() + matmul_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1]) + matmul_op.output_shape.extend([matmul_output_shape]) + + # Inverse transform + iwt_op = mace_pb2.OperatorDef() + arg = iwt_op.arg.add() + arg.name = 
'T' + arg.i = self.dt + batch_arg = iwt_op.arg.add() + batch_arg.name = 'batch' + batch_arg.i = output_shape[0] + height_arg = iwt_op.arg.add() + height_arg.name = 'height' + height_arg.i = output_shape[1] + width_arg = iwt_op.arg.add() + width_arg.name = 'width' + width_arg.i = output_shape[2] + iwt_op.name = op.name + '_inverse_transform' + iwt_op.type = 'WinogradInverseTransform' + iwt_op.input.extend([matmul_output_name]) + + final_op = op + self.resolved_ops[op.name] = 1 + + if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd' : + bias_add_op = self.tf_graph[op.name][0] + output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT") + iwt_op.input.extend([output_name]) + final_op = bias_add_op + self.resolved_ops[bias_add_op.name] = 1 + + if len(self.tf_graph[final_op.name]) == 1 \ + and self.tf_graph[final_op.name][0].type in activation_name_map: + activation_op = self.tf_graph[final_op.name][0] + fused_act_arg = iwt_op.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + if activation_op.type == 'Relu6': + max_limit_arg = iwt_op.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + final_op = activation_op + self.resolved_ops[activation_op.name] = 1 + + iwt_op.output.extend([output.name for output in final_op.outputs]) + self.add_output_shape(final_op.outputs, iwt_op) + self.net_def.op.extend([wt_op, matmul_op, iwt_op]) + + + def convert_conv2d(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + if op.type == 'DepthwiseConv2dNative': + op_def.type = 'DepthwiseConv2d' + else: + op_def.type = op.type + self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (0, 1, 3, 2) + if self.device == 'gpu': + op_def.input.extend([op.inputs[0].name]) + buffer_type = "DW_CONV2D_FILTER" if op_def.type == 'DepthwiseConv2d' else "CONV2D_FILTER" + output_name = 
self.add_buffer_to_image(get_input_tensor(op, 1).name, buffer_type) + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(op, i).name for i in range(len(op.inputs))]) + + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode[op.get_attr('padding')] + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend(op.get_attr('strides')[1:3]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + final_op = op + self.resolved_ops[op.name] = 1 + + if len(self.tf_graph.get(op.name, [])) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd': + bias_add_op = self.tf_graph[op.name][0] + if self.device == 'gpu': + output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) + final_op = bias_add_op + self.resolved_ops[bias_add_op.name] = 1 + + if len(self.tf_graph.get(final_op.name, [])) == 1 \ + and self.tf_graph[final_op.name][0].type in activation_name_map: + activation_op = self.tf_graph[final_op.name][0] + op_def.type = "FusedConv2D" + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + if activation_op.type == 'Relu6': + max_limit_arg = op_def.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + final_op = activation_op + self.resolved_ops[activation_op.name] = 1 + + op_def.output.extend([output.name for output in final_op.outputs]) + self.add_output_shape(final_op.outputs, op_def) + self.net_def.op.extend([op_def]) + + def convert_fused_batchnorm(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + op_def.name = op.name + op_def.type = 
'FoldedBatchNorm' + + gamma_tensor = get_input_tensor(op, 1) + for i in range(1, 5): + input_tensor = get_input_tensor(op, i) + assert input_tensor.shape == gamma_tensor.shape + self.unused_tensor.add(input_tensor.name) + + gamma_value = get_input_tensor(op, 1).eval().astype(np.float32) + beta_value = get_input_tensor(op, 2).eval().astype(np.float32) + mean_value = get_input_tensor(op, 3).eval().astype(np.float32) + var_value = get_input_tensor(op, 4).eval().astype(np.float32) + epsilon_value = op.get_attr('epsilon') + + scale_value = ( + (1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) * + gamma_value) + offset_value = (-mean_value * scale_value) + beta_value + idx = gamma_tensor.name.rfind('/') + name_prefix = gamma_tensor.name[:idx] + '/' + input_names = [name_prefix+'scale:0', name_prefix+'offset:0'] + self.add_tensor(input_names[0], gamma_value.shape, + gamma_tensor.dtype, scale_value) + self.add_tensor(input_names[1], gamma_value.shape, + gamma_tensor.dtype, offset_value) + + op_def.input.extend([op.inputs[0].name]) + if self.device == 'gpu': + for name in input_names: + output_name = self.add_buffer_to_image(name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([name for name in input_names]) + + self.resolved_ops[op.name] = 1 + final_op = op + + if len(self.tf_graph[op.name]) == 1 \ + and self.tf_graph[op.name][0].type in activation_name_map: + activation_op = self.tf_graph[op.name][0] + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + if activation_op.type == 'Relu6': + max_limit_arg = op_def.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + final_op = activation_op + self.resolved_ops[activation_op.name] = 1 + + op_def.output.extend([final_op.outputs[0].name]) + self.add_output_shape([final_op.outputs[0]], op_def) + + self.net_def.op.extend([op_def]) + + def convert_batchnorm(self, op): + bn_ops = [] + 
bn_ops.append(op) + for i in range(1, 3): + if len(self.tf_graph[bn_ops[i-1].name]) == 1 \ + and self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]: + bn_ops.append(self.tf_graph[bn_ops[i-1].name][0]) + else: + raise Exception('Invalid BatchNorm Op') + if len(self.tf_graph[bn_ops[2].name]) == 2 \ + and self.tf_graph[bn_ops[2].name][0].type == BATCH_NORM_ORDER[3] \ + and self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]: + bn_ops.append(self.tf_graph[bn_ops[2].name][0]) + bn_ops.append(self.tf_graph[bn_ops[2].name][1]) + else: + raise Exception('Invalid BatchNorm Op') + bn_ops.append(self.tf_graph[bn_ops[4].name][0]) + bn_ops.append(self.tf_graph[bn_ops[3].name][0]) + + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + input_name = get_input_tensor(bn_ops[3], 0).name + gamma = get_input_tensor(bn_ops[2], 1).name + beta = get_input_tensor(bn_ops[5], 0).name + mean = get_input_tensor(bn_ops[4], 0).name + variance = get_input_tensor(bn_ops[0], 0).name + + op_def.name = op.name[:-4] # remove /add + op_def.type = 'BatchNorm' + if self.device == 'gpu': + op_def.input.extend([input_name]) + for tensor_name in [gamma, beta, mean, variance]: + output_name = self.add_buffer_to_image(tensor_name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([input_name, gamma, beta, mean, variance]) + op_def.output.extend([output.name for output in bn_ops[6].outputs]) + self.add_output_shape(bn_ops[6].outputs, op_def) + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'epsilon' + epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + self.unused_tensor.add(get_input_tensor(op, 1).name) + + self.net_def.op.extend([op_def]) + for i in range(0, 7): + self.resolved_ops[bn_ops[i].name] = 1 + + def convert_pooling(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() 
    arg.name = 'T'
    arg.i = self.dt
    op_def.name = op.name
    op_def.type = 'Pooling'
    op_def.input.extend([input.name for input in op.inputs])
    op_def.output.extend([output.name for output in op.outputs])
    self.add_output_shape(op.outputs, op_def)
    pooling_type_arg = op_def.arg.add()
    pooling_type_arg.name = 'pooling_type'
    pooling_type_arg.i = pooling_type_mode[op.type]
    padding_arg = op_def.arg.add()
    padding_arg.name = 'padding'
    padding_arg.i = padding_mode[op.get_attr('padding')]
    strides_arg = op_def.arg.add()
    strides_arg.name = 'strides'
    # [1:3] keeps only the H/W strides of the NHWC attribute.
    strides_arg.ints.extend(op.get_attr('strides')[1:3])
    kernels_arg = op_def.arg.add()
    kernels_arg.name = 'kernels'
    kernels_arg.ints.extend(op.get_attr('ksize')[1:3])
    data_format_arg = op_def.arg.add()
    data_format_arg.name = 'data_format'
    data_format_arg.s = 'NHWC'
    self.resolved_ops[op.name] = 1

  # Lower a TF Mean over spatial axes into a MACE 'Pooling' op configured as a
  # global average pool: VALID padding, stride 1, and a kernel sized to the
  # input's full H x W (inputs[0].shape[1:3], NHWC).
  def convert_global_avg_pooling(self, op):
    op_def = self.net_def.op.add()
    arg = op_def.arg.add()
    arg.name = 'T'
    arg.i = self.dt  # data type (mace_pb2 DataType) chosen at converter init
    op_def.name = op.name
    op_def.type = 'Pooling'
    # Only the data input is kept; the reduction-axes tensor is handled by the
    # caller (added to unused_tensor in convert()).
    op_def.input.extend([op.inputs[0].name])
    op_def.output.extend([output.name for output in op.outputs])
    self.add_output_shape(op.outputs, op_def)
    pooling_type_arg = op_def.arg.add()
    pooling_type_arg.name = 'pooling_type'
    pooling_type_arg.i = pooling_type_mode['AvgPool']
    padding_arg = op_def.arg.add()
    padding_arg.name = 'padding'
    padding_arg.i = padding_mode['VALID']
    strides_arg = op_def.arg.add()
    strides_arg.name = 'strides'
    strides_arg.ints.extend([1, 1])
    kernels_arg = op_def.arg.add()
    kernels_arg.name = 'kernels'
    # Kernel covers the whole spatial extent -> global pooling.
    kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3])
    data_format_arg = op_def.arg.add()
    data_format_arg.name = 'data_format'
    data_format_arg.s = 'NHWC'
    self.resolved_ops[op.name] = 1

  # Lower a standalone activation op (Relu/Sigmoid/Tanh) to a MACE
  # 'Activation' op; the concrete kind is carried in the 'activation' arg.
  def convert_activation(self, op):
    op_def = self.net_def.op.add()
    arg = op_def.arg.add()
    arg.name = 'T'
    arg.i = self.dt
    op_def.name = op.name
    op_def.type = 'Activation'
    
activation_arg = op_def.arg.add() + activation_arg.name = 'activation' + activation_arg.s = activation_name_map[op.type] + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_relu6(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = 'Activation' + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + activation_arg = op_def.arg.add() + activation_arg.name = 'activation' + activation_arg.s = "RELUX" + max_limit_arg = op_def.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + self.resolved_ops[op.name] = 1 + + def convert_add(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "AddN" + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_concat(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "Concat" + op_def.input.extend([input.name for input in op.inputs[:-1]]) + op_def.output.extend([output.name for output in op.outputs]) + axis_arg = op_def.arg.add() + axis_arg.name = 'axis' + axis_arg.i = get_input_tensor(op, len(op.inputs) - 1).eval().astype(np.int32) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + self.unused_tensor.add(get_input_tensor(op, len(op.inputs) - 1).name) + + def convert_resize_bilinear(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name 
= op.name + op_def.type = "ResizeBilinear" + op_def.input.extend([op.inputs[0].name]) + op_def.output.extend([output.name for output in op.outputs]) + size_arg = op_def.arg.add() + size_arg.name = 'size' + size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat) + size_arg = op_def.arg.add() + size_arg.name = 'align_corners' + size_arg.i = op.get_attr('align_corners') + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + self.unused_tensor.add(get_input_tensor(op, 1).name) + + def convert_bias_add(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "BiasAdd" + op_def.input.extend([op.inputs[0].name]) + if self.device == 'gpu': + output_name = self.add_buffer_to_image(get_input_tensor(op, 1).name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(op, 1).name]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.net_def.op.extend([op_def]) + self.resolved_ops[op.name] = 1 + + def convert_space_to_batch(self, op, b2s): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = op.type + op_def.input.extend([op.inputs[0].name]) + op_def.output.extend([output.name for output in op.outputs]) + size_arg = op_def.arg.add() + size_arg.name = 'block_shape' + size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat) + size_arg = op_def.arg.add() + if b2s: + size_arg.name = 'crops' + else: + size_arg.name = 'paddings' + size_arg.ints.extend(get_input_tensor(op, 2).eval().astype(np.int32).flat) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + self.unused_tensor.add(get_input_tensor(op, 1).name) + self.unused_tensor.add(get_input_tensor(op, 2).name) + + def is_atrous_conv2d(self, op): + return op.type == 
'SpaceToBatchND' and\ + len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Conv2D' + + def convert_atrous_conv2d(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + conv_op = self.tf_graph[op.name][0] + op_def.name = conv_op.name + op_def.type = conv_op.type + self.transpose_filter_tensor[get_input_tensor(conv_op, 1).name] = (0, 1, 3, 2) + if self.device == 'gpu': + op_def.input.extend([op.inputs[0].name]) + output_name = self.add_buffer_to_image(get_input_tensor(conv_op, 1).name, "CONV2D_FILTER") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(op, 0).name]) + op_def.input.extend([get_input_tensor(conv_op, 1).name]) + + dilation_arg = op_def.arg.add() + dilation_arg.name = 'dilations' + dilation_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat) + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat + if len(padding_values) > 0 and padding_values[0] > 0: + padding_arg.i = padding_mode['SAME'] + else: + padding_arg.i = padding_mode['VALID'] + self.unused_tensor.add(get_input_tensor(op, 1).name) + self.unused_tensor.add(get_input_tensor(op, 2).name) + + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend([1, 1]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + final_op = conv_op + self.resolved_ops[op.name] = 1 + self.resolved_ops[conv_op.name] = 1 + + if len(self.tf_graph[final_op.name]) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd' : + bias_add_op = self.tf_graph[final_op.name][0] + if self.device == 'gpu': + output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) + final_op = bias_add_op + 
self.resolved_ops[bias_add_op.name] = 1 + + if len(self.tf_graph[final_op.name]) == 1 \ + and self.tf_graph[final_op.name][0].type == 'BatchToSpaceND': + final_op = self.tf_graph[final_op.name][0] + self.resolved_ops[final_op.name] = 1 + self.unused_tensor.add(get_input_tensor(final_op, 1).name) + self.unused_tensor.add(get_input_tensor(final_op, 2).name) + else: + raise Exception('Convert atrous conv error: no BatchToSpaceND op') + + if len(self.tf_graph[final_op.name]) == 1 \ + and self.tf_graph[final_op.name][0].type == 'Relu': + relu_op = self.tf_graph[final_op.name][0] + op_def.type = "FusedConv2D" + fused_relu_arg = op_def.arg.add() + fused_relu_arg.name = 'activation' + fused_relu_arg.s = "RELU" + final_op = relu_op + self.resolved_ops[relu_op.name] = 1 + + op_def.output.extend([output.name for output in final_op.outputs]) + self.add_output_shape(final_op.outputs, op_def) + self.net_def.op.extend([op_def]) + + def is_softmax(self, op): + return op.type == 'Softmax' and \ + len(self.tf_parents[op.name]) == 1 and self.tf_parents[op.name][0].type == 'Reshape' and \ + len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Reshape' + + def convert_softmax(self, softmax_op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + # deal with first Reshape op + parent_reshape_op = self.tf_parents[softmax_op.name][0] + self.unused_tensor.add(get_input_tensor(parent_reshape_op, 1).name) + self.resolved_ops[parent_reshape_op.name] = 1 + + # FIXME: hardcode for inception_v3 + # remove squeeze if exist + squeeze_op = self.tf_parents[parent_reshape_op.name][0] + if squeeze_op.type == 'Squeeze': + op_def.input.extend([squeeze_op.inputs[0].name]) + self.resolved_ops[squeeze_op.name] = 1 + # remove shape if exist + children_ops = self.tf_graph[squeeze_op.name] + print children_ops + if len(children_ops) > 1 and children_ops[0].type == 'Shape': + self.unused_tensor.add(get_input_tensor(children_ops[1], 0).name) + 
self.resolved_ops[children_ops[1].name] = 1 + else: + op_def.input.extend([parent_reshape_op.inputs[0].name]) + + # deal with Softmax op + op_def.name = softmax_op.name + op_def.type = softmax_op.type + self.resolved_ops[softmax_op.name] = 1 + + # deal with last Reshape op + reshape_op = self.tf_graph[softmax_op.name][0] + self.unused_tensor.add(get_input_tensor(reshape_op, 1).name) + + if reshape_op.outputs[0].shape.ndims == 2: + shape = reshape_op.outputs[0].shape + from tensorflow.python.framework.tensor_shape import as_shape + reshape_op.outputs[0]._shape = as_shape([1, 1, shape[0], shape[1]]) + op_def.output.extend([output.name for output in reshape_op.outputs]) + self.add_output_shape(reshape_op.outputs, op_def) + self.resolved_ops[reshape_op.name] = 1 + + def convert_normal_op(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = op.type + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def replace_in_out_name(self, input_names, output_names, is_single): + in_names = set([input_name + ":0" for input_name in input_names]) + out_names = set([output_name + ":0" for output_name in output_names]) + if is_single: + for op in self.net_def.op: + if len(op.input) > 0 and op.input[0] in in_names: + op.input[0] = MACE_INPUT_NODE_NAME + ':0' + if len(op.output) > 0 and op.output[0] in out_names: + op.output[0] = MACE_OUTPUT_NODE_NAME + ':0' + else: + for op in self.net_def.op: + if len(op.input) > 0 and op.input[0] in in_names: + op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0] + if len(op.output) > 0 and op.output[0] in out_names: + op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0] + + def convert(self, input_nodes, output_nodes): + is_single = len(input_nodes) == 1 and len(output_nodes) == 1 + if self.device == 
'gpu': + self.add_input_transform(input_nodes, is_single) + + for op in self.tf_ops: + if self.resolved_ops[op.name] == 1: + continue + if op.type in ['Placeholder', 'Identity']: + self.resolved_ops[op.name] = 1 + pass + elif op.type == 'Const': + pass + elif op.type == 'Reshape': + self.convert_reshape(op) + elif self.is_atrous_conv2d(op): + self.convert_atrous_conv2d(op) + elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative': + if self.check_winograd_conv(op): + self.convert_winograd_conv(op) + else: + self.convert_conv2d(op) + elif op.type == 'FusedBatchNorm': + self.convert_fused_batchnorm(op) + elif op.type == 'Add' and op.name.endswith('batchnorm/add'): + self.convert_batchnorm(op) + elif op.type == 'AvgPool' or op.type == 'MaxPool': + self.convert_pooling(op) + elif op.type == 'Relu6': + self.convert_relu6(op) + elif op.type == 'Add': + self.convert_add(op) + elif op.type == 'ConcatV2': + self.convert_concat(op) + elif op.type == 'ResizeBilinear': + self.convert_resize_bilinear(op) + elif op.type == 'BiasAdd': + self.convert_bias_add(op) + elif op.type == 'SpaceToBatchND': + self.convert_space_to_batch(op, False) + elif op.type == 'BatchToSpaceND': + self.convert_space_to_batch(op, True) + elif self.is_softmax(op): + self.convert_softmax(op) + elif op.type in ['Relu', 'Sigmoid', 'Tanh']: + self.convert_activation(op) + # FIXME: hardcode for inception_v3 + elif op.type in ['Squeeze', 'Shape']: + self.resolved_ops[op.name] = 1 + elif op.type == 'Mean': + # Global avg pooling + reduce_dims = op.inputs[1].eval() + if reduce_dims[0] == 1 and reduce_dims[1] == 2: + self.convert_global_avg_pooling(op) + self.unused_tensor.add(op.inputs[1].name) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) + #elif op.type in ['']: + # self.convert_normal_op(op) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) + + for op in self.tf_ops: + if self.resolved_ops[op.name] == 1: + continue + elif op.type == 'Const': + 
self.convert_tensor(op) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) + + if self.device == 'gpu': + self.add_output_transform(output_nodes, is_single) + + if self.device == 'cpu': + self.replace_in_out_name(input_nodes, output_nodes, is_single) + + for key in self.resolved_ops: + if self.resolved_ops[key] != 1: + print 'Unresolve Op: %s' % key + +class Optimizer: + def __init__(self, net_def, device): + self.net_def = net_def + self.device = device + self.mace_graph = {} + self.tensor_map = {} + for op in net_def.op: + for input_name in op.input: + if input_name not in self.mace_graph: + self.mace_graph[input_name] = [] + self.mace_graph[input_name].append(op) + + for tensor in net_def.tensors: + self.tensor_map[tensor.name] = tensor + + def get_buffer_tensor_name(self, name): + if self.device == 'gpu': + return name[:-6] + name[-2:] + else: + return name + + def fold_batch_norm(self): + unused_tensors = set() + new_tensors = [] + new_net = mace_pb2.NetDef() + resolved_ops = set() + + for op in self.net_def.op: + if op.name in resolved_ops: + pass + elif op.type == 'DepthwiseConv2d' and len(op.output) == 1 \ + and self.mace_graph[op.output[0]][0].type == 'FoldedBatchNorm': + depthwise_conv2d_op = op + folded_bn_op = self.mace_graph[op.output[0]][0] + weight_buffer_name = self.get_buffer_tensor_name(depthwise_conv2d_op.input[1]) + weight_tensor = self.tensor_map[weight_buffer_name] + scale_buffer_name = self.get_buffer_tensor_name(folded_bn_op.input[1]) + offset_buffer_name = self.get_buffer_tensor_name(folded_bn_op.input[2]) + scale_tensor = self.tensor_map[scale_buffer_name] + weight_shape = weight_tensor.dims + idx = 0 + for i in range(weight_shape[0]): + for j in range(weight_shape[1]): + for ic in range(weight_shape[2]): + for oc in range(weight_shape[3]): + weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[3] + oc] + idx += 1 + + new_tensors.append(weight_tensor) + unused_tensors.add(weight_tensor.name) + 
unused_tensors.add(scale_tensor.name) + + if self.device == 'gpu': + scale_b2i_op = self.mace_graph[scale_buffer_name][0] + offset_b2i_op = self.mace_graph[offset_buffer_name][0] + resolved_ops.add(scale_b2i_op.name) + resolved_ops.add(offset_b2i_op.name) + new_net.op.extend([offset_b2i_op]) + + resolved_ops.add(depthwise_conv2d_op.name) + resolved_ops.add(folded_bn_op.name) + + offset_tensor_name = folded_bn_op.input[2] + depthwise_conv2d_op.input.extend([offset_tensor_name]) + + for arg in folded_bn_op.arg: + if arg.name == 'activation': + act_arg = depthwise_conv2d_op.arg.add() + act_arg.name = arg.name + act_arg.s = arg.s + elif arg.name == 'max_limit': + act_arg = depthwise_conv2d_op.arg.add() + act_arg.name = arg.name + act_arg.f = arg.f + + depthwise_conv2d_op.output[0] = folded_bn_op.output[0] + new_net.op.extend([depthwise_conv2d_op]) + else: + new_net.op.extend([op]) + + for tensor in self.net_def.tensors: + if tensor.name in unused_tensors: + pass + else: + new_net.tensors.extend([tensor]) + + for tensor in new_tensors: + new_net.tensors.extend([tensor]) + + return new_net + + def optimize(self): + new_net = self.fold_batch_norm() + return new_net + +def add_shape_info(input_graph_def, input_nodes, input_shapes): + inputs_replaced_graph = graph_pb2.GraphDef() + for node in input_graph_def.node: + if node.name in input_nodes: + idx = input_nodes.index(node.name) + input_shape = input_shapes[idx] + placeholder_node = copy.deepcopy(node) + placeholder_node.attr.clear() + placeholder_node.attr['shape'].shape.dim.extend([ + tensor_shape_pb2.TensorShapeProto.Dim(size=i) for i in input_shape + ]) + placeholder_node.attr['dtype'].CopyFrom(node.attr['dtype']) + inputs_replaced_graph.node.extend([placeholder_node]) + else: + inputs_replaced_graph.node.extend([copy.deepcopy(node)]) + return inputs_replaced_graph + + +def convert_to_mace_pb(model_file, input_node, input_shape, output_node, data_type, device, winograd): + net_def = mace_pb2.NetDef() + dt = 
data_type_map[data_type] + + input_graph_def = tf.GraphDef() + with gfile.Open(model_file, "rb") as f: + data = f.read() + input_graph_def.ParseFromString(data) + + input_nodes = [x for x in input_node.split(',')] + input_shapes = [] + if input_shape != "": + input_shape_strs = [x for x in input_shape.split(':')] + for shape_str in input_shape_strs: + input_shapes.extend([[int(x) for x in shape_str.split(',')]]) + output_nodes = [x for x in output_node.split(',')] + assert len(input_nodes) == len(input_shapes) + + input_graph_def = add_shape_info(input_graph_def, input_nodes, input_shapes) + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(input_graph_def, name="") + ops = graph.get_operations() + converter = TFConverter(ops, net_def, dt, device, winograd) + converter.convert(input_nodes, output_nodes) + optimizer = Optimizer(net_def, device) + net_def = optimizer.optimize() + print "Model Converted." + if device == 'gpu': + print "start optimize memory." + mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) + mem_optimizer.optimize() + print "Memory optimization done." 
+ + return net_def diff --git a/mace/python/tools/tf_dsp_converter_lib.py b/mace/python/tools/tf_dsp_converter_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..f53c25aa29753593ef21d670a5325a72403347da --- /dev/null +++ b/mace/python/tools/tf_dsp_converter_lib.py @@ -0,0 +1,408 @@ +from mace.proto import mace_pb2 +import tensorflow as tf +from tensorflow import gfile +from operator import mul +from dsp_ops import DspOps +from mace.python.tools import graph_util +from mace.python.tools.convert_util import tf_dtype_2_mace_dtype + +# converter --input ../libcv/quantized_model.pb --output quantized_model_dsp.pb \ +# --runtime dsp --input_node input_node --output_node output_node + +padding_mode = { + 'NA': 0, + 'SAME': 1, + 'VALID': 2, + 'MIRROR_REFLECT': 3, + 'MIRROR_SYMMETRIC': 4, + 'SAME_CAFFE': 5 +} + +def get_tensor_name_from_op(op_name, port): + return op_name + ':' + str(port) + +def get_node_from_map(op_map, op_or_tensor_name): + op_name = op_or_tensor_name.split(':')[0] + return op_map[op_name] + +def get_op_and_port_from_tensor(tensor_name): + op, port = tensor_name.split(':') + port = int(port) + return op, port + +def max_elem_size(tensor): + if len(tensor.shape.as_list()) == 0: + return tensor.dtype.size + else: + return reduce(mul, tensor.shape.as_list()) * tensor.dtype.size + +def find_dtype(tensor_dtype): + if tensor_dtype == tf.float32: + return mace_pb2.DT_FLOAT + elif tensor_dtype == tf.uint8 or tensor_dtype == tf.quint8: + return mace_pb2.DT_UINT8 + elif tensor_dtype == tf.int32 or tensor_dtype == tf.qint32: + return mace_pb2.DT_INT32 + else: + raise Exception('Unsupported data type: ', tensor_dtype) + +def has_padding_and_strides(op): + return 'padding' in op.node_def.attr and 'strides' in op.node_def.attr + +def is_node_flatten_reshape(op): + return op.type == 'Reshape' and len(op.outputs[0].shape) == 1 + +def get_input_tensor(op, index): + input_tensor = op.inputs[index] + if input_tensor.op.type == 'Reshape': + 
input_tensor = get_input_tensor(input_tensor.op, 0) + return input_tensor + +def add_shape_const_node(net_def, op, values, name): + print ('Add const node: ', op.name + '/' + name) + tensor = net_def.tensors.add() + node_name = op.name + '/' + name + tensor.name = node_name + ':0' + tensor.data_type = mace_pb2.DT_INT32 + tensor.dims.extend(values) + return tensor.name + + +def convert_op_outputs(mace_op_def, tf_op): + mace_op_def.output_type.extend([tf_dtype_2_mace_dtype(output.dtype) + for output in tf_op.outputs]) + output_shapes = [] + for output in tf_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + mace_op_def.output_shape.extend(output_shapes) + + +def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops): + first_op = unresolved_ops[0] + print ('Op: ', first_op.name, first_op.type, first_op.outputs[0].shape) + + if first_op.name in resolved_ops: + pass + + elif first_op.type == 'Const': + print ('Add const node: ', first_op.name) + tf_tensor = first_op.outputs[0].eval() + tensor = net_def.tensors.add() + tensor.name = first_op.outputs[0].name + tensor.data_type = find_dtype(first_op.outputs[0].dtype) + shape = list(tf_tensor.shape) + if len(shape) > 0: + tensor.dims.extend(shape) + if first_op.outputs[0].dtype == tf.float32: + tensor.float_data.extend(tf_tensor.astype(float).flat) + elif first_op.outputs[0].dtype == tf.int32 or \ + first_op.outputs[0].dtype == tf.int8 or \ + first_op.outputs[0].dtype == tf.int16 or \ + first_op.outputs[0].dtype == tf.quint8 or \ + first_op.outputs[0].dtype == tf.quint16: + tensor.int32_data.extend(tf_tensor.astype(int).flat) + + else: + op_def = net_def.op.add() + op_def.name = first_op.name + op_def.type = dsp_ops.map_nn_op(first_op.type) + op_def.padding = padding_mode['NA'] + + if len(first_op.outputs) > 0 and first_op.type == 'Dequantize' \ + and len(first_op.outputs[0].consumers()) > 0 \ + and 
(first_op.outputs[0].consumers()[0].type == 'SpaceToBatchND' \ + or first_op.outputs[0].consumers()[0].type == 'BatchToSpaceND'): + input_tensor = first_op.inputs[0] + min_tensor = first_op.inputs[1] + max_tensor = first_op.inputs[2] + s2b_op = first_op.outputs[0].consumers()[0] + reshape_op = s2b_op.outputs[0].consumers()[0] + min_op = reshape_op.outputs[0].consumers()[0] + max_op = reshape_op.outputs[0].consumers()[1] + quantize_op = min_op.outputs[0].consumers()[0] + resolved_ops.add(s2b_op.name) + resolved_ops.add(reshape_op.name) + resolved_ops.add(min_op.name) + resolved_ops.add(max_op.name) + resolved_ops.add(quantize_op.name) + + op_def.name = quantize_op.name + op_def.type = dsp_ops.map_nn_op('Quantized' + s2b_op.type) + op_def.input.append(input_tensor.name) + op_def.input.extend([t.name for t in s2b_op.inputs[1:]]) + op_def.input.extend([min_tensor.name, max_tensor.name]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_op.outputs]) + convert_op_outputs(op_def, quantize_op) + elif len(first_op.outputs) > 0 and first_op.type == 'QuantizedReshape' \ + and len(first_op.outputs[0].consumers()) > 0 \ + and first_op.outputs[0].consumers()[0].type == 'Dequantize' \ + and len(first_op.outputs[0].consumers()[0].outputs[0].consumers()) > 0 \ + and first_op.outputs[0].consumers()[0].outputs[0].consumers()[0].type == 'Softmax': + input_tensor = first_op.inputs[0] + min_tensor = first_op.inputs[2] + max_tensor = first_op.inputs[3] + dequantize_op = first_op.outputs[0].consumers()[0] + softmax_op = dequantize_op.outputs[0].consumers()[0] + reshape_op = softmax_op.outputs[0].consumers()[0] + min_op = reshape_op.outputs[0].consumers()[0] + max_op = reshape_op.outputs[0].consumers()[1] + quantize_op = min_op.outputs[0].consumers()[0] + quantize_reshape_op = quantize_op.outputs[0].consumers()[0] + + resolved_ops.add(dequantize_op.name) + resolved_ops.add(softmax_op.name) + resolved_ops.add(reshape_op.name) + resolved_ops.add(min_op.name) + 
resolved_ops.add(max_op.name) + resolved_ops.add(quantize_op.name) + resolved_ops.add(quantize_reshape_op.name) + + op_def.name = quantize_reshape_op.name + op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax') + op_def.input.extend([input_tensor.name, min_tensor.name, max_tensor.name]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_reshape_op.outputs]) + convert_op_outputs(op_def, quantize_reshape_op) + elif has_padding_and_strides(first_op): + op_def.padding = padding_mode[first_op.get_attr('padding')] + op_def.input.extend([t.name for t in first_op.inputs]) + if 'ksize' in first_op.node_def.attr: + ksize = first_op.get_attr('ksize') + ksize_tensor = add_shape_const_node(net_def, first_op, ksize, 'ksize') + op_def.input.extend([ksize_tensor]) + strides = first_op.get_attr('strides') + strides_tensor = add_shape_const_node(net_def, first_op, strides, 'strides') + op_def.input.extend([strides_tensor]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs]) + convert_op_outputs(op_def, first_op) + elif is_node_flatten_reshape(first_op): + op_def.type = 'Flatten' + op_def.input.extend([t.name for t in first_op.inputs]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs]) + convert_op_outputs(op_def, first_op) + elif dsp_ops.has_op(first_op.type): + op_def.input.extend([t.name for t in first_op.inputs]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs]) + convert_op_outputs(op_def, first_op) + else: + raise Exception('Unsupported op: ', first_op) + + resolved_ops.add(first_op.name) + + del unresolved_ops[0] + +def add_output_node(net_def, output_node): + op_def = net_def.op.add() + op_def.name = '__output__' + op_def.type = 'OUTPUT' + op_def.input.extend([get_tensor_name_from_op(output_node, 0)]) + +def reverse_batch_to_space_and_biasadd(net_def): + tensor_map = {} + for tensor in net_def.tensors: + tensor_map[tensor.name] = tensor + op_map = {} + for 
op in net_def.op:  # (continuation of a `for` statement split at the chunk boundary)
        op_map[op.name] = op
    consumers = {}
    # Build a reverse index: tensor name -> list of ops that consume it.
    for op in net_def.op:
        for ipt in op.input:
            if ipt not in consumers:
                consumers[ipt] = []
            consumers[ipt].append(op)

    new_ops = []         # replacement ops to append to the rewritten graph
    skip_ops = set()     # names of original ops to drop from the rewritten graph
    visited_ops = set()  # ops already handled by the pattern rewrite below

    for op in net_def.op:
        if op.name in visited_ops:
            # NOTE(review): this branch only `pass`es, so already-visited ops
            # still fall through to the pattern match below and could be
            # rewritten twice; it looks like `continue` was intended --
            # confirm against upstream history before changing.
            pass
        # pattern: QConv -> RR -> R -> QB2S -> QBiasAdd -> RR -> R
        # NOTE(review): `success` is assigned but never read in the visible
        # chunk; presumably vestigial.
        success = False
        if op.type == 'Requantize_32to8':
            # Walk backwards from the requantize op to check for the
            # Conv -> Requantize -> BatchToSpaceND -> BiasAdd -> Requantize
            # chain that this pass reorders (BiasAdd is moved before B2S).
            biasadd_requantize_op = op
            biasadd_op = get_node_from_map(op_map, biasadd_requantize_op.input[0])
            if biasadd_op.type == 'QuantizedBiasAdd_8p8to32':
                b2s_op = get_node_from_map(op_map, biasadd_op.input[0])
                if b2s_op.type == 'QuantizedBatchToSpaceND_8':
                    conv_requantize_op = get_node_from_map(op_map, b2s_op.input[0])
                    conv_op = get_node_from_map(op_map, conv_requantize_op.input[0])
                    if conv_op.type == 'QuantizedConv2d_8x8to32':
                        # Re-wire BiasAdd to read directly from the conv's
                        # requantize outputs (data, min, max).
                        new_biasadd_op = mace_pb2.OperatorDef()
                        new_biasadd_op.CopyFrom(biasadd_op)
                        new_biasadd_op.input[0] = get_tensor_name_from_op(conv_requantize_op.name, 0)
                        new_biasadd_op.input[2] = get_tensor_name_from_op(conv_requantize_op.name, 1)
                        new_biasadd_op.input[3] = get_tensor_name_from_op(conv_requantize_op.name, 2)
                        # 8p8to32 output is 32-bit, so presumably 4x the 8-bit
                        # requantized byte size -- hence the *4 / /4 pair below.
                        new_biasadd_op.out_max_byte_size[0] = conv_requantize_op.out_max_byte_size[0] * 4

                        new_biasadd_requantize_op = mace_pb2.OperatorDef()
                        new_biasadd_requantize_op.CopyFrom(biasadd_requantize_op)
                        new_biasadd_requantize_op.out_max_byte_size[0] = new_biasadd_op.out_max_byte_size[0] / 4

                        # BatchToSpaceND now consumes the (moved) biasadd's
                        # requantized outputs instead of feeding the biasadd.
                        new_b2s_op = mace_pb2.OperatorDef()
                        new_b2s_op.CopyFrom(b2s_op)
                        new_b2s_op.input[0] = get_tensor_name_from_op(biasadd_requantize_op.name, 0)
                        new_b2s_op.input[3] = get_tensor_name_from_op(biasadd_requantize_op.name, 1)
                        new_b2s_op.input[4] = get_tensor_name_from_op(biasadd_requantize_op.name, 2)

                        new_ops.extend([new_biasadd_op, new_biasadd_requantize_op, new_b2s_op])
                        skip_ops = skip_ops.union([biasadd_op.name, biasadd_requantize_op.name, b2s_op.name])
                        visited_ops.add(op.name)

                        # Downstream consumers must be re-pointed at the moved
                        # B2S outputs; assignment RHS continues in the next chunk.
                        follow_ops =
consumers[get_tensor_name_from_op(biasadd_requantize_op.name, 0)] + for follow_op in follow_ops: + new_follow_op = mace_pb2.OperatorDef() + new_follow_op.CopyFrom(follow_op) + for i in xrange(len(follow_op.input)): + for k in xrange(3): + if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k): + new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k) + new_ops.append(new_follow_op) + skip_ops.add(follow_op.name) + visited_ops.add(follow_op.name) + + visited_ops.add(op.name) + + new_net_def = mace_pb2.NetDef() + new_net_def.tensors.extend(tensor_map.values()) + new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops]) + new_net_def.op.extend(new_ops) + + return new_net_def + +def add_node_id(net_def): + node_id_counter = 0 + node_id_map = {} + for tensor in net_def.tensors: + tensor.node_id = node_id_counter + node_id_counter += 1 + tensor_op, port = get_op_and_port_from_tensor(tensor.name) + node_id_map[tensor_op] = tensor.node_id + + for op in net_def.op: + op.node_id = node_id_counter + node_id_counter += 1 + node_id_map[op.name] = op.node_id + for ipt in op.input: + op_name, port = get_op_and_port_from_tensor(ipt) + node_id = node_id_map[op_name] + node_input = op.node_input.add() + node_input.node_id = node_id + node_input.output_port = int(port) + + return net_def + +def add_input_output_info(net_def, input_node, output_node, graph, dtype): + input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0)) + output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0)) + + input_info = net_def.input_info.add() + input_info.dims.extend(input_tensor.shape.as_list()) + input_info.data_type = dtype + if dtype == mace_pb2.DT_UINT8: + for i in xrange(2): + input_info = net_def.input_info.add() + input_info.dims.extend([1,1,1,1]) + input_info.data_type = mace_pb2.DT_FLOAT + + output_info = net_def.output_info.add() + output_info.dims.extend(output_tensor.shape.as_list()) + 
output_info.data_type = dtype + if dtype == mace_pb2.DT_UINT8: + for i in xrange(2): + output_info = net_def.output_info.add() + output_info.dims.extend([1,1,1,1]) + output_info.data_type = mace_pb2.DT_FLOAT + + return net_def + +def fuse_quantize(net_def, input_node, output_node): + tensor_map = {} + for tensor in net_def.tensors: + tensor_map[tensor.name] = tensor + op_map = {} + for op in net_def.op: + op_map[op.name] = op + consumers = {} + for op in net_def.op: + for ipt in op.input: + if ipt not in consumers: + consumers[ipt] = [] + consumers[ipt].append(op) + + skip_ops = set() + new_ops = [] + skip_tensors = set() + + # INPUT->Flatten->Minf, Maxf->Quantize + for op in net_def.op: + if op.type == 'INPUT': + input_op = op + flatten_op = None + quantize_op = None + for o in consumers[get_tensor_name_from_op(input_op.name, 0)]: + if o.type == 'Flatten': + flatten_op = o + elif o.type == 'Quantize': + quantize_op = o + if quantize_op is not None: + minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)] + skip_ops = skip_ops.union([flatten_op.name, minf_op.name, maxf_op.name]) + skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]]) + quantize_op.type = 'AutoQuantize' + del quantize_op.input[1:] + + new_net_def = mace_pb2.NetDef() + new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors]) + new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops]) + new_net_def.op.extend(new_ops) + return new_net_def + +def convert_to_mace_pb(model_file, input_node, output_node, dsp_mode): + """ + nnlib does not have batch norm, so use tensorflow optimizer to fold + batch norm with convolution. The fold optimization reorders ops, so + we sort ops first by topology. 
+ """ + input_graph_def = tf.GraphDef() + with gfile.Open(model_file, "rb") as f: + data = f.read() + input_graph_def.ParseFromString(data) + + input_graph_def = graph_util.sort_tf_graph(input_graph_def) + net_def = mace_pb2.NetDef() + + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(input_graph_def, name="") + ops = graph.get_operations() + dsp_ops = DspOps() + resolved_ops = set() + # convert const node + unresolved_ops = [op for op in ops if op.type == 'Const'] + while len(unresolved_ops) > 0: + convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops) + + # convert op node + unresolved_ops = [op for op in ops if op.type != 'Const'] + while len(unresolved_ops) > 0: + convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops) + + add_output_node(net_def, output_node) + net_def = reverse_batch_to_space_and_biasadd(net_def) + net_def = fuse_quantize(net_def, input_node, output_node) + + sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__') + net_def_with_node_id = add_node_id(sorted_net_def) + + dtype = mace_pb2.DT_FLOAT + final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype) + + arg = final_net_def.arg.add() + arg.name = 'dsp_mode' + arg.i = dsp_mode + + return final_net_def + diff --git a/tools/benchmark.sh b/tools/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..c327825546af73ed8cc8214a94b94b5d82cb46b0 --- /dev/null +++ b/tools/benchmark.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +Usage() { + echo "Usage: bash tools/benchmark.sh model_output_dir" +} + +if [ $# -lt 1 ]; then + Usage + exit 1 +fi + +CURRENT_DIR=`dirname $0` +source ${CURRENT_DIR}/env.sh + +MODEL_OUTPUT_DIR=$1 + +if [ -f "$MODEL_OUTPUT_DIR/benchmark_model" ]; then + rm -rf $MODEL_OUTPUT_DIR/benchmark_model +fi + +if [ "$EMBED_MODEL_DATA" = 0 ]; then + cp codegen/models/${MODEL_TAG}/${MODEL_TAG}.data $MODEL_OUTPUT_DIR +fi + +if [ 
x"$TARGET_ABI" == x"host" ]; then + bazel build --verbose_failures -c opt --strip always benchmark:benchmark_model \ + --copt="-std=c++11" \ + --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ + --copt="-Werror=return-type" \ + --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \ + --copt="-O3" \ + --define openmp=true \ + --define production=true || exit 1 + + cp bazel-bin/benchmark/benchmark_model $MODEL_OUTPUT_DIR + + MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \ + ${MODEL_OUTPUT_DIR}/benchmark_model \ + --model_data_file=${PHONE_DATA_DIR}/${MODEL_TAG}.data \ + --device=${DEVICE_TYPE} \ + --input_node="${INPUT_NODES}" \ + --input_shape="${INPUT_SHAPES}"\ + --output_node="${OUTPUT_NODES}" \ + --output_shape="${OUTPUT_SHAPES}"\ + --input_file=${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME} || exit 1 + +else + bazel build --verbose_failures -c opt --strip always benchmark:benchmark_model \ + --crosstool_top=//external:android/crosstool \ + --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ + --cpu=${TARGET_ABI} \ + --copt="-std=c++11" \ + --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ + --copt="-Werror=return-type" \ + --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \ + --copt="-O3" \ + --define openmp=true \ + --define production=true || exit 1 + + cp bazel-bin/benchmark/benchmark_model $MODEL_OUTPUT_DIR + + adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1 + IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}" + for NAME in "${INPUT_NAMES[@]}";do + FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME}) + adb push ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME} ${PHONE_DATA_DIR} || exit 1 + done + adb push ${MODEL_OUTPUT_DIR}/benchmark_model ${PHONE_DATA_DIR} || exit 1 + if [ "$EMBED_MODEL_DATA" = 0 ]; then + adb push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data ${PHONE_DATA_DIR} || exit 1 + fi + + adb /dev/null + adb pull ${PHONE_DATA_DIR}/mace_run.config ${CL_BIN_DIR} > /dev/null + fi +fi + +python mace/python/tools/opencl_codegen.py \ + --cl_binary_dirs=${CL_BIN_DIRS} \ + 
--output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc + +python mace/python/tools/binary_codegen.py \ + --binary_dirs=${CL_BIN_DIRS} \ + --binary_file_name=mace_run.config \ + --output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc diff --git a/tools/mace_tools.py b/tools/mace_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..556d77412dc92df2fd0a673c59b7fafcfe0dc90e --- /dev/null +++ b/tools/mace_tools.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python + +# Must run at root dir of libmace project. +# python tools/mace_tools.py \ +# --config=tools/example.yaml \ +# --round=100 \ +# --mode=all + +import argparse +import hashlib +import os +import shutil +import subprocess +import sys +import urllib +import yaml + +from ConfigParser import ConfigParser + +def run_command(command): + print("Run command: {}".format(command)) + result = subprocess.Popen( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = result.communicate() + + if out: + print("Stdout msg:\n{}".format(out)) + if err: + print("Stderr msg:\n{}".format(err)) + + if result.returncode != 0: + raise Exception("Exit not 0 from bash with code: {}, command: {}".format( + result.returncode, command)) + + +def get_global_runtime(configs): + runtime_list = [] + for model_name in configs["models"]: + model_runtime = configs["models"][model_name]["runtime"] + runtime_list.append(model_runtime.lower()) + + global_runtime = "" + if "dsp" in runtime_list: + global_runtime = "dsp" + elif "gpu" in runtime_list: + global_runtime = "gpu" + elif "cpu" in runtime_list: + global_runtime = "cpu" + else: + raise Exception("Not found available RUNTIME in config files!") + + return global_runtime + + +def generate_opencl_and_version_code(): + command = "bash tools/generate_opencl_and_version_code.sh" + run_command(command) + + +def clear_env(): + command = "bash tools/clear_env.sh" + run_command(command) + + +def generate_random_input(model_output_dir): + generate_data_or_not 
= True + command = "bash tools/validate_tools.sh {} {}".format( + model_output_dir, int(generate_data_or_not)) + run_command(command) + + +def generate_model_code(): + command = "bash tools/generate_model_code.sh" + run_command(command) + + +def build_mace_run(production_mode, model_output_dir, hexagon_mode): + command = "bash tools/build_mace_run.sh {} {} {}".format( + int(production_mode), model_output_dir, int(hexagon_mode)) + run_command(command) + + +def tuning_run(model_output_dir, running_round, tuning, production_mode, restart_round): + command = "bash tools/tuning_run.sh {} {} {} {} {}".format( + model_output_dir, running_round, int(tuning), int(production_mode), restart_round) + run_command(command) + + +def benchmark_model(model_output_dir): + command = "bash tools/benchmark.sh {}".format(model_output_dir) + run_command(command) + + +def run_model(model_output_dir, running_round, restart_round): + tuning_run(model_output_dir, running_round, False, False, restart_round) + + +def generate_production_code(model_output_dirs, pull_or_not): + cl_bin_dirs = [] + for d in model_output_dirs: + cl_bin_dirs.append(os.path.join(d, "opencl_bin")) + cl_bin_dirs_str = ",".join(cl_bin_dirs) + command = "bash tools/generate_production_code.sh {} {}".format( + cl_bin_dirs_str, int(pull_or_not)) + run_command(command) + + +def build_mace_run_prod(model_output_dir, tuning, global_runtime): + if "dsp" == global_runtime: + hexagon_mode = True + else: + hexagon_mode = False + + production_or_not = False + build_mace_run(production_or_not, model_output_dir, hexagon_mode) + tuning_run( + model_output_dir, + running_round=0, + tuning=tuning, + production_mode=production_or_not, + restart_round=1) + + production_or_not = True + pull_or_not = True + generate_production_code([model_output_dir], pull_or_not) + build_mace_run(production_or_not, model_output_dir, hexagon_mode) + + +def build_run_throughput_test(run_seconds, merged_lib_file, model_input_dir): + command = "bash 
# NOTE(review): this chunk is a whitespace-mangled git-diff paste; the line
# structure and Python indentation below are reconstructed.  The code is
# Python 2 (print statements, urllib.urlretrieve) -- run with python2.
# Tail of build_run_throughput_test(); its "def" line lies before this chunk.
tools/build_run_throughput_test.sh {} {} {}".format(
        run_seconds, merged_lib_file, model_input_dir)
    run_command(command)


def validate_model(model_output_dir):
    # Compare mace output against the reference framework by shelling out to
    # tools/validate_tools.sh.  generate_data_or_not=False: reuse the input
    # data generated earlier instead of creating fresh random inputs.
    generate_data_or_not = False
    command = "bash tools/validate_tools.sh {} {}".format(
        model_output_dir, int(generate_data_or_not))
    run_command(command)


def build_production_code():
    # Build the production libraries via the helper script.
    command = "bash tools/build_production_code.sh"
    run_command(command)


def merge_libs_and_tuning_results(output_dir, model_output_dirs):
    # Regenerate production code (pull_or_not=False: do not pull tuning
    # results from the device), rebuild it, then merge all per-model static
    # libraries into one archive via tools/merge_libs.sh.
    pull_or_not = False
    generate_production_code(model_output_dirs, pull_or_not)
    build_production_code()

    # merge_libs.sh expects the model output dirs as one comma-separated arg.
    model_output_dirs_str = ",".join(model_output_dirs)
    command = "bash tools/merge_libs.sh {} {}".format(output_dir,
                                                      model_output_dirs_str)
    run_command(command)


def parse_model_configs():
    # Load the YAML model-config file named by --config.
    # NOTE(review): yaml.load() without an explicit Loader can construct
    # arbitrary Python objects; prefer yaml.safe_load() if the config file is
    # not fully trusted -- left unchanged here (comment-only review).
    with open(FLAGS.config) as f:
        configs = yaml.load(f)
        return configs


def parse_args():
    """Parses command line arguments."""
    parser = argparse.ArgumentParser()
    # Registers a "bool" type so --tuning=true/false parses as a boolean.
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument(
        "--config",
        type=str,
        default="./tool/config",
        help="The global config file of models.")
    parser.add_argument(
        "--output_dir", type=str, default="build", help="The output dir.")
    parser.add_argument(
        "--round", type=int, default=1, help="The model running round.")
    parser.add_argument("--run_seconds", type=int, default=10,
                        help="The model throughput test running seconds.")
    parser.add_argument(
        "--restart_round", type=int, default=1, help="The model restart round.")
    parser.add_argument(
        "--tuning", type="bool", default="true", help="Tune opencl params.")
    parser.add_argument("--mode", type=str, default="all",
                        help="[build|run|validate|merge|all|throughput_test].")
    return parser.parse_known_args()


def main(unused_args):
    # Drive the whole build/run/validate/merge pipeline.  Sub-steps are shell
    # scripts that receive their parameters through environment variables.
    configs = parse_model_configs()

    if FLAGS.mode == "build" or FLAGS.mode == "all":
        # Remove previous output dirs
        if not os.path.exists(FLAGS.output_dir):
            os.makedirs(FLAGS.output_dir)
        elif os.path.exists(os.path.join(FLAGS.output_dir, "libmace")):
            shutil.rmtree(os.path.join(FLAGS.output_dir, "libmace"))

    if FLAGS.mode == "validate":
        # Validation only needs a single round / single restart.
        FLAGS.round = 1
        FLAGS.restart_round = 1

    os.environ["EMBED_MODEL_DATA"] = str(configs["embed_model_data"])
    os.environ["VLOG_LEVEL"] = str(configs["vlog_level"])
    # Project name is the config file's basename without extension.
    os.environ["PROJECT_NAME"] = os.path.splitext(os.path.basename(FLAGS.config))[0]

    generate_opencl_and_version_code()

    for target_abi in configs["target_abis"]:
        global_runtime = get_global_runtime(configs)
        # Transfer params by environment
        os.environ["TARGET_ABI"] = target_abi
        model_output_dirs = []
        for model_name in configs["models"]:
            # Transfer params by environment
            os.environ["MODEL_TAG"] = model_name
            print '=======================', model_name, '======================='
            model_config = configs["models"][model_name]
            for key in model_config:
                # List-valued entries are flattened: node names joined with
                # ",", shapes joined with ":"; everything else stringified.
                if key in ['input_nodes', 'output_nodes'] and isinstance(model_config[key], list):
                    os.environ[key.upper()] = ",".join(model_config[key])
                elif key in ['input_shapes', 'output_shapes'] and isinstance(model_config[key], list):
                    os.environ[key.upper()] = ":".join(model_config[key])
                else:
                    os.environ[key.upper()] = str(model_config[key])

            # Hash the model path so different model sources get distinct
            # output directories: <output>/<model>/<md5(path)>/<abi>.
            md5 = hashlib.md5()
            md5.update(model_config["model_file_path"])
            model_path_digest = md5.hexdigest()
            model_output_dir = "%s/%s/%s/%s" % (FLAGS.output_dir, model_name, model_path_digest, target_abi)
            model_output_dirs.append(model_output_dir)

            if FLAGS.mode == "build" or FLAGS.mode == "all":
                if os.path.exists(model_output_dir):
                    shutil.rmtree(model_output_dir)
                os.makedirs(model_output_dir)
                clear_env()

                # Support http:// and https://
                # NOTE(review): nesting of the download block under the build
                # branch is inferred -- indentation was lost in this paste.
                if model_config["model_file_path"].startswith(
                        "http://") or model_config["model_file_path"].startswith("https://"):
                    os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb"
                    urllib.urlretrieve(model_config["model_file_path"],
                                       os.environ["MODEL_FILE_PATH"])

                if model_config["platform"] == "caffe" and (model_config["weight_file_path"].startswith(
                        "http://") or model_config["weight_file_path"].startswith("https://")):
                    os.environ["WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel"
                    urllib.urlretrieve(model_config["weight_file_path"], os.environ["WEIGHT_FILE_PATH"])

            if FLAGS.mode == "build" or FLAGS.mode == "run" or FLAGS.mode == "validate"\
                    or FLAGS.mode == "benchmark" or FLAGS.mode == "all":
                generate_random_input(model_output_dir)

            if FLAGS.mode == "build" or FLAGS.mode == "all":
                generate_model_code()
                build_mace_run_prod(model_output_dir, FLAGS.tuning, global_runtime)

            if FLAGS.mode == "run" or FLAGS.mode == "validate" or FLAGS.mode == "all":
                run_model(model_output_dir, FLAGS.round, FLAGS.restart_round)

            if FLAGS.mode == "benchmark":
                benchmark_model(model_output_dir)

            if FLAGS.mode == "validate" or FLAGS.mode == "all":
                validate_model(model_output_dir)

        if FLAGS.mode == "build" or FLAGS.mode == "merge" or FLAGS.mode == "all":
            # Per-ABI merge of all model libs and tuning results.
            merge_libs_and_tuning_results(FLAGS.output_dir + "/" + target_abi,
                                          model_output_dirs)

    if FLAGS.mode == "throughput_test":
        # Throughput test links against the merged lib of the FIRST abi only.
        merged_lib_file = FLAGS.output_dir + "/%s/libmace/lib/libmace_%s.a" % \
            (configs["target_abis"][0], os.environ["PROJECT_NAME"])
        generate_random_input(FLAGS.output_dir)
        for model_name in configs["models"]:
            runtime = configs["models"][model_name]["runtime"]
            # e.g. CPU_MODEL_TAG / GPU_MODEL_TAG / DSP_MODEL_TAG.
            os.environ["%s_MODEL_TAG" % runtime.upper()] = model_name
        build_run_throughput_test(FLAGS.run_seconds, merged_lib_file, FLAGS.output_dir)


if __name__ == "__main__":
    FLAGS, unparsed = parse_args()
    main(unused_args=[sys.argv[0]] + unparsed)

diff --git a/tools/merge_libs.sh b/tools/merge_libs.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c872e8617436df12f4968936f4e9e163958de94f
--- /dev/null
+++ b/tools/merge_libs.sh
#!/bin/bash

Usage() {
  echo "Usage: bash
tools/merge_libs.sh libmace_output_dir model_output_dirs"
}

if [ $# -lt 2 ]; then
  Usage
  exit 1
fi

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

LIBMACE_BUILD_DIR=$1
MODEL_OUTPUT_DIRS=$2
# Comma-separated list -> bash array.
MODEL_OUTPUT_DIRS_ARR=(${MODEL_OUTPUT_DIRS//,/ })
MODEL_HEADER_DIR=${LIBMACE_BUILD_DIR}/libmace/include/mace/public
MODEL_DATA_DIR=${LIBMACE_BUILD_DIR}/libmace/data

# Recreate the libmace output layout from scratch and stage public headers
# plus the hexagon controller shared lib.
rm -rf ${LIBMACE_BUILD_DIR}/libmace
mkdir -p ${LIBMACE_BUILD_DIR}/libmace/include/mace/public
mkdir -p ${LIBMACE_BUILD_DIR}/libmace/lib
mkdir -p ${MODEL_DATA_DIR}
cp -rf ${MACE_SOURCE_DIR}/mace/public/*.h ${LIBMACE_BUILD_DIR}/libmace/include/mace/public/
cp ${MACE_SOURCE_DIR}/mace/core/runtime/hexagon/libhexagon_controller.so ${LIBMACE_BUILD_DIR}/libmace/lib

LIBMACE_TEMP_DIR=`mktemp -d -t libmace.XXXX`

# Merge all libraries in to one
# Build an "ar -M" MRI script that lists every static lib to fold in.
echo "create ${LIBMACE_BUILD_DIR}/libmace/lib/libmace_${PROJECT_NAME}.a" > ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri

if [ x"$TARGET_ABI" = x"host" ]; then
  # Host builds use the position-independent (.pic.a) variants.
  echo "addlib bazel-bin/mace/codegen/libgenerated_opencl_prod.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/codegen/libgenerated_tuning_params.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
else
  echo "addlib bazel-bin/mace/codegen/libgenerated_opencl_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/codegen/libgenerated_tuning_params.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/codegen/libgenerated_version.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/core/libcore.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/core/libopencl_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/kernels/libkernels.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/utils/libutils.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/utils/libutils_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
fi

# Add every per-model archive, and stage model data files and headers.
for model_output_dir in ${MODEL_OUTPUT_DIRS_ARR[@]}; do
  for lib in ${model_output_dir}/*.a; do
    echo "addlib ${lib}" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  done
  for data_file in ${model_output_dir}/*.data; do
    cp ${data_file} ${MODEL_DATA_DIR}
  done
  for header_file in ${model_output_dir}/*.h; do
    cp ${header_file} ${MODEL_HEADER_DIR}
  done
done
echo "save" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "end" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
# NOTE(review): always uses the aarch64 NDK ar, even when TARGET_ABI is
# another ABI or host -- confirm this is intended.
$ANDROID_NDK_HOME/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-ar \
  -M < ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri || exit 1

rm -rf ${LIBMACE_TEMP_DIR}

echo "Libs merged!"
diff --git a/tools/tuning_run.sh b/tools/tuning_run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e8a0f8bc54363b8ba3d02fd18efe4b110263791b
--- /dev/null
+++ b/tools/tuning_run.sh
#!/bin/bash

Usage() {
  echo "Usage: bash tools/tuning_run.sh model_output_dir round tuning production_mode"
}

if [ $# -lt 4 ]; then
  Usage
  exit 1
fi

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

MODEL_OUTPUT_DIR=$1
ROUND=$2
TUNING_OR_NOT=$3
PRODUCTION_MODE=$4
# NOTE(review): a fifth positional arg is read here, but Usage and the
# "$# -lt 4" check above only cover four -- confirm callers always pass it.
RESTART_ROUND=$5

if [ x"$TARGET_ABI" = x"host" ]; then
  # Host target: run mace_run locally instead of on a device.
  MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
  ${MODEL_OUTPUT_DIR}/mace_run \
    --input_node="${INPUT_NODES}" \
    --input_shape="${INPUT_SHAPES}"\
    --output_node="${OUTPUT_NODES}" \
    --output_shape="${OUTPUT_SHAPES}"\
    --input_file=${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME} \
    --output_file=${MODEL_OUTPUT_DIR}/${OUTPUT_FILE_NAME} \
    --model_data_file=${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data \
    --device=${DEVICE_TYPE} \
    --round=1 \
    --restart_round=1 || exit 1
else
  # Tune only when requested and not building for production.
  if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != 1 ]];then
    tuning_flag=1
  else
    tuning_flag=0
  fi

  adb
shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1
  if [ "$PRODUCTION_MODE" = 0 ]; then
    adb shell "mkdir -p ${KERNEL_DIR}" || exit 1
  fi

  # Push one generated input file per input node.  Node names may contain
  # characters like ":" or "/", so map every non-alnum char to "_".
  IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}"
  for NAME in "${INPUT_NAMES[@]}";do
    FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
    adb push ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME} ${PHONE_DATA_DIR} > /dev/null || exit 1
  done

  # NOTE(review): the next line is corrupted in this paste -- an
  # "adb push <something> ${PHONE_DATA_DIR} >" prefix appears to have been
  # lost (by analogy with the surrounding pushes, plausibly the mace_run
  # binary).  TODO: recover the original line from git history.
  adb /dev/null || exit 1
  if [ "$EMBED_MODEL_DATA" = 0 ]; then
    # Model weights are a separate .data file only when not embedded.
    adb push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data ${PHONE_DATA_DIR} > /dev/null || exit 1
  fi
  adb push mace/core/runtime/hexagon/libhexagon_controller.so ${PHONE_DATA_DIR} > /dev/null || exit 1

  # NOTE(review): the paste is hard-truncated here.  Everything from this
  # "mace_adb_output=`adb ..." command through the head of tools/validate.py
  # (its diff header, imports, load_data(), and most of compare_output()) is
  # missing; the text resumes mid-way through compare_output()'s
  # per-runtime similarity thresholds.  Do not apply this hunk as-is.
  mace_adb_output=`adb 0.999) or \
        (FLAGS.mace_runtime == "gpu" and similarity > 0.995) or \
        (FLAGS.mace_runtime == "dsp" and similarity > 0.930):
      print '=======================Similarity Test Passed======================'
    else:
      print '=======================Similarity Test Failed======================'
      sys.exit(-1)
  else:
    print '=======================Skip empty node==================='
    sys.exit(-1)


def validate_tf_model(input_names, input_shapes, output_names):
    # Run the reference TensorFlow graph on the generated inputs and compare
    # each output tensor against the corresponding mace output file.
    import tensorflow as tf
    if not os.path.isfile(FLAGS.model_file):
        print("Input graph file '" + FLAGS.model_file + "' does not exist!")
        sys.exit(-1)

    # NOTE(review): nesting below is inferred -- indentation was lost.
    input_graph_def = tf.GraphDef()
    with open(FLAGS.model_file, "rb") as f:
        data = f.read()
        input_graph_def.ParseFromString(data)
        tf.import_graph_def(input_graph_def, name="")

    with tf.Session() as session:
        with session.graph.as_default() as graph:
            tf.import_graph_def(input_graph_def, name="")
            input_dict = {}
            for i in range(len(input_names)):
                # One input file per node, reshaped to the declared shape.
                input_value = load_data(FLAGS.input_file + "_" + input_names[i])
                input_value = input_value.reshape(input_shapes[i])
                input_node = graph.get_tensor_by_name(input_names[i] + ':0')
                input_dict[input_node] = input_value

            output_nodes = []
            for name in output_names:
                output_nodes.extend([graph.get_tensor_by_name(name +
':0')])
            output_values = session.run(output_nodes, feed_dict=input_dict)
            for i in range(len(output_names)):
                output_file_name = FLAGS.mace_out_file + "_" + format_output_name(output_names[i])
                mace_out_value = load_data(output_file_name)
                compare_output(output_names[i], mace_out_value, output_values[i])

def validate_caffe_model(input_names, input_shapes, output_names, output_shapes):
    # Run the reference Caffe model on the generated inputs and compare each
    # output blob against the corresponding mace output file.
    os.environ['GLOG_minloglevel'] = '1' # suprress Caffe verbose prints
    import caffe
    if not os.path.isfile(FLAGS.model_file):
        print("Input graph file '" + FLAGS.model_file + "' does not exist!")
        sys.exit(-1)
    if not os.path.isfile(FLAGS.weight_file):
        print("Input weight file '" + FLAGS.weight_file + "' does not exist!")
        sys.exit(-1)

    caffe.set_mode_cpu()

    net = caffe.Net(FLAGS.model_file, caffe.TEST, weights=FLAGS.weight_file)

    for i in range(len(input_names)):
        input_value = load_data(FLAGS.input_file + "_" + input_names[i])
        # NHWC (mace layout) -> NCHW (caffe layout).
        input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1, 2))
        net.blobs[input_names[i]].data[0] = input_value

    net.forward()

    for i in range(len(output_names)):
        value = net.blobs[net.top_names[output_names[i]][0]].data[0]
        out_shape = output_shapes[i]
        # Rotate the declared NHWC shape to NCHW for reshaping the raw blob,
        # then transpose the data back to NHWC to match the mace output.
        out_shape[1], out_shape[2], out_shape[3] = out_shape[3], out_shape[1], out_shape[2]
        value = value.reshape(out_shape).transpose((0, 2, 3, 1))
        output_file_name = FLAGS.mace_out_file + "_" + format_output_name(output_names[i])
        mace_out_value = load_data(output_file_name)
        compare_output(output_names[i], mace_out_value, value)

def main(unused_args):
    # Node names are comma-separated; shapes are colon-separated groups of
    # comma-separated ints (one group per node).
    input_names = [name for name in FLAGS.input_node.split(',')]
    input_shape_strs = [shape for shape in FLAGS.input_shape.split(':')]
    input_shapes = [[int(x) for x in shape.split(',')] for shape in input_shape_strs]
    output_names = [name for name in FLAGS.output_node.split(',')]
    assert len(input_names) == len(input_shapes)

    if FLAGS.platform == 'tensorflow':
        validate_tf_model(input_names, input_shapes, output_names)
    elif FLAGS.platform == 'caffe':
        # Caffe additionally needs output shapes for the NCHW<->NHWC dance.
        output_shape_strs = [shape for shape in FLAGS.output_shape.split(':')]
        output_shapes = [[int(x) for x in shape.split(',')] for shape in output_shape_strs]
        validate_caffe_model(input_names, input_shapes, output_names, output_shapes)

def parse_args():
    """Parses command line arguments."""
    parser = argparse.ArgumentParser()
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument(
        "--platform",
        type=str,
        default="",
        help="Tensorflow or Caffe.")
    parser.add_argument(
        "--model_file",
        type=str,
        default="",
        help="TensorFlow or Caffe \'GraphDef\' file to load.")
    parser.add_argument(
        "--weight_file",
        type=str,
        default="",
        help="caffe model file to load.")
    parser.add_argument(
        "--input_file",
        type=str,
        default="",
        help="input file.")
    parser.add_argument(
        "--mace_out_file",
        type=str,
        default="",
        help="mace output file to load.")
    parser.add_argument(
        "--mace_runtime",
        type=str,
        default="gpu",
        help="mace runtime device.")
    parser.add_argument(
        "--input_shape",
        type=str,
        default="1,64,64,3",
        help="input shape.")
    parser.add_argument(
        "--output_shape",
        type=str,
        default="1,64,64,2",
        help="output shape.")
    parser.add_argument(
        "--input_node",
        type=str,
        default="input_node",
        help="input node")
    parser.add_argument(
        "--output_node",
        type=str,
        default="output_node",
        help="output node")

    return parser.parse_known_args()


if __name__ == '__main__':
    FLAGS, unparsed = parse_args()
    main(unused_args=[sys.argv[0]] + unparsed)

diff --git a/tools/validate_tools.sh b/tools/validate_tools.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6a458676dcaced33e878da197acc545123e0aeb8
--- /dev/null
+++ b/tools/validate_tools.sh
#!/bin/bash

Usage() {
  echo "Usage: bash tools/validate_tools.sh model_output_dir generate_data_or_not"
}

if [ $# -lt 2
]; then
  Usage
  exit 1
fi

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

MODEL_OUTPUT_DIR=$1
GENERATE_DATA_OR_NOT=$2

# Node lists arrive comma-separated via environment variables.
IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}"
IFS=',' read -r -a OUTPUT_NAMES <<< "${OUTPUT_NODES}"

echo $MODEL_OUTPUT_DIR
# Mode 1: only (re)generate random input data for each input node, then exit.
if [ "$GENERATE_DATA_OR_NOT" = 1 ]; then
  for NAME in "${INPUT_NAMES[@]}";do
    # Map non-alphanumeric chars in node names to "_" for file names.
    FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
    rm -rf ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME}
  done
  python -u tools/generate_data.py --input_node=${INPUT_NODES} \
    --input_file=${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME} \
    --input_shape="${INPUT_SHAPES}" || exit 1
  exit 0
fi

if [ "$PLATFORM" == "tensorflow" ];then
  # Pull the mace outputs from the device first (unless running on host).
  if [[ x"$TARGET_ABI" != x"host" ]]; then
    for NAME in "${OUTPUT_NAMES[@]}";do
      FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
      rm -rf ${MODEL_OUTPUT_DIR}/${OUTPUT_FILE_NAME}_${FORMATTED_NAME}
      adb pull ${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME}_${FORMATTED_NAME} ${MODEL_OUTPUT_DIR} > /dev/null
    done
  fi
  python -u tools/validate.py --platform=tensorflow \
    --model_file ${MODEL_FILE_PATH} \
    --input_file ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME} \
    --mace_out_file ${MODEL_OUTPUT_DIR}/${OUTPUT_FILE_NAME} \
    --mace_runtime ${RUNTIME} \
    --input_node ${INPUT_NODES} \
    --output_node ${OUTPUT_NODES} \
    --input_shape ${INPUT_SHAPES} \
    --output_shape ${OUTPUT_SHAPES} || exit 1

elif [ "$PLATFORM" == "caffe" ];then
  # Caffe validation runs inside a docker container built from docker/caffe.
  IMAGE_NAME=mace-caffe:latest
  CONTAINER_NAME=mace_caffe_validator
  RES_FILE=validation.result

  # Build the image once, create the container once, start it if stopped.
  if [[ "$(docker images -q mace-caffe:latest 2> /dev/null)" == "" ]]; then
    echo "Build caffe docker"
    docker build -t ${IMAGE_NAME} docker/caffe || exit 1
  fi

  if [ ! "$(docker ps -qa -f name=${CONTAINER_NAME})" ]; then
    echo "Run caffe container"
    docker run -d -it --name ${CONTAINER_NAME} ${IMAGE_NAME} /bin/bash || exit 1
  fi

  if [ "$(docker inspect -f {{.State.Running}} ${CONTAINER_NAME})" == "false" ];then
    echo "Start caffe container"
    docker start ${CONTAINER_NAME}
  fi

  # Copy generated inputs into the container.
  for NAME in "${INPUT_NAMES[@]}";do
    FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
    docker cp ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME} ${CONTAINER_NAME}:/mace
  done

  # Pull mace outputs from the device (unless host), then copy them in too.
  if [[ x"$TARGET_ABI" != x"host" ]]; then
    for NAME in "${OUTPUT_NAMES[@]}";do
      FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
      rm -rf ${MODEL_OUTPUT_DIR}/${OUTPUT_FILE_NAME}_${FORMATTED_NAME}
      adb pull ${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME}_${FORMATTED_NAME} ${MODEL_OUTPUT_DIR} > /dev/null
    done
  fi
  for NAME in "${OUTPUT_NAMES[@]}";do
    FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
    docker cp ${MODEL_OUTPUT_DIR}/${OUTPUT_FILE_NAME}_${FORMATTED_NAME} ${CONTAINER_NAME}:/mace
  done

  # Ship the model, weights and validate.py, then run the comparison inside
  # the container (exit status propagates via "|| exit 1").
  MODEL_FILE_NAME=$(basename ${MODEL_FILE_PATH})
  WEIGHT_FILE_NAME=$(basename ${WEIGHT_FILE_PATH})
  docker cp tools/validate.py ${CONTAINER_NAME}:/mace
  docker cp ${MODEL_FILE_PATH} ${CONTAINER_NAME}:/mace
  docker cp ${WEIGHT_FILE_PATH} ${CONTAINER_NAME}:/mace
  docker exec -it ${CONTAINER_NAME} python -u /mace/validate.py \
    --platform=caffe \
    --model_file /mace/${MODEL_FILE_NAME} \
    --weight_file /mace/${WEIGHT_FILE_NAME} \
    --input_file /mace/${INPUT_FILE_NAME} \
    --mace_out_file /mace/${OUTPUT_FILE_NAME} \
    --mace_runtime ${RUNTIME} \
    --input_node ${INPUT_NODES} \
    --output_node ${OUTPUT_NODES} \
    --input_shape ${INPUT_SHAPES} \
    --output_shape ${OUTPUT_SHAPES} || exit 1

fi