Commit 3d426864 authored by Liangliang He

Merge branch 'merge_with_libmace' into 'master'

Merge with libmace

See merge request !284
...@@ -56,3 +56,20 @@ android_ndk_repository(
    # Android 5.0
    api_level = 21
)
git_repository(
name = "com_github_gflags_gflags",
#tag = "v2.2.0",
commit = "30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e", # v2.2.0 + fix of include path
remote = "https://github.com/gflags/gflags.git"
)
bind(
name = "gflags",
actual = "@com_github_gflags_gflags//:gflags",
)
bind(
name = "gflags_nothreads",
actual = "@com_github_gflags_gflags//:gflags_nothreads",
)
FROM ubuntu:16.04
# Update source
# mirrors.163.com does not seem to work inside the Xiaomi network
# RUN sed -i 's/http:\/\/archive\.ubuntu\.com\/ubuntu\//http:\/\/mirrors\.163\.com\/ubuntu\//g' /etc/apt/sources.list
RUN apt-get update -y
## Basic tools
RUN apt-get install -y --no-install-recommends apt-utils
RUN apt-get install -y --no-install-recommends \
build-essential \
cmake \
curl \
git \
libcurl3-dev \
libgoogle-glog-dev \
libfreetype6-dev \
libpng12-dev \
libprotobuf-dev \
libzmq3-dev \
pkg-config \
python-dev \
python-pip \
protobuf-compiler \
rsync \
software-properties-common \
unzip \
zip \
zlib1g-dev \
openjdk-8-jdk \
openjdk-8-jre-headless \
openssh-server \
wget
RUN pip install --upgrade pip
ENV ANDROID_NDK_HOME /opt/android-ndk
ENV ANDROID_NDK /opt/android-ndk
ENV ANDROID_NDK_VERSION r15c
# Android NDK
# download
RUN mkdir /opt/android-ndk-tmp && \
cd /opt/android-ndk-tmp && \
wget -q https://dl.google.com/android/repository/android-ndk-${ANDROID_NDK_VERSION}-linux-x86_64.zip && \
# uncompress
unzip -q android-ndk-${ANDROID_NDK_VERSION}-linux-x86_64.zip && \
# move to its final location
mv ./android-ndk-${ANDROID_NDK_VERSION} ${ANDROID_NDK_HOME} && \
# remove temp dir
cd ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
# add to PATH
ENV PATH ${PATH}:${ANDROID_NDK_HOME}
# Set up Bazel.
# Running bazel inside a `docker build` command causes trouble, cf:
# https://github.com/bazelbuild/bazel/issues/134
# The easiest solution is to set up a bazelrc file forcing --batch.
RUN echo "startup --batch" >>/etc/bazel.bazelrc
# Similarly, we need to workaround sandboxing issues:
# https://github.com/bazelbuild/bazel/issues/418
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# Install the most recent bazel release.
ENV BAZEL_VERSION 0.7.0
WORKDIR /
RUN mkdir /bazel && \
cd /bazel && \
wget https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
chmod +x bazel-*.sh && \
./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
cd / && \
rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
# Setup vim
RUN apt-get update -y
RUN apt-get install -y --no-install-recommends \
locales \
vim
RUN mkdir -p ~/.vim/autoload ~/.vim/bundle && \
curl -LSso ~/.vim/autoload/pathogen.vim https://tpo.pe/pathogen.vim
RUN mkdir -p ~/.vim/bundle
RUN cd ~/.vim/bundle && \
git clone https://github.com/scrooloose/nerdtree.git && \
git clone https://github.com/vim-syntastic/syntastic.git && \
git clone https://github.com/vim-airline/vim-airline.git && \
git clone https://github.com/altercation/vim-colors-solarized.git && \
git clone https://github.com/bazelbuild/vim-ft-bzl.git && \
git clone https://github.com/google/vim-maktaba.git && \
git clone https://github.com/google/vim-codefmt.git
RUN curl -LSso ~/.vimrc https://gist.githubusercontent.com/llhe/71c5802919debd5825c100c0135478a7/raw/16a35020cc65f9c25d0cf8f11a3ba7b345a1271d/.vimrc
RUN locale-gen en_US.UTF-8
ENV LC_CTYPE=en_US.UTF-8
ENV LC_ALL=en_US.UTF-8
ENV TERM xterm-256color
# Extra packages
RUN apt-get install -y --no-install-recommends \
telnet \
net-tools \
inetutils-ping \
screen \
android-tools-adb
# Install tools
RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com setuptools
RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.4.0 \
scipy \
jinja2 \
pyyaml
# Download tensorflow tools
RUN wget http://cnbj1-inner-fds.api.xiaomi.net/mace/tool/transform_graph && \
chmod +x transform_graph
# Install gitlab runner
RUN curl -L https://packages.gitlab.com/install/repositories/runner/gitlab-ci-multi-runner/script.deb.sh | bash
RUN apt-get install -y gitlab-ci-multi-runner
ENTRYPOINT gitlab-runner run
# Docker image with the MACE development environment
* Log in to the [Xiaomi container registry](http://docs.api.xiaomi.net/docker-registry/)
```
docker login cr.d.xiaomi.net
```
* Build the image using the `Dockerfile`
```
docker build -t cr.d.xiaomi.net/mace/mace-dev .
```
* Or pull the image directly from the registry
```
docker pull cr.d.xiaomi.net/mace/mace-dev
```
* Start a container
```
# Set 'host' network to use ADB
docker run -it --rm -v /local/path:/container/path --net=host cr.d.xiaomi.net/mace/mace-dev /bin/bash
```
FROM ubuntu:16.04
LABEL maintainer caffe-maint@googlegroups.com
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
wget \
libatlas-base-dev \
libboost-all-dev \
libgflags-dev \
libgoogle-glog-dev \
libhdf5-serial-dev \
libleveldb-dev \
liblmdb-dev \
libopencv-dev \
libprotobuf-dev \
libsnappy-dev \
protobuf-compiler \
python-dev \
python-numpy \
python-pip \
python-setuptools \
python-scipy && \
rm -rf /var/lib/apt/lists/*
ENV CAFFE_ROOT=/opt/caffe
WORKDIR $CAFFE_ROOT
# FIXME: use ARG instead of ENV once DockerHub supports this
# https://github.com/docker/hub-feedback/issues/460
ENV CLONE_TAG=1.0
RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \
pip install --upgrade pip && \
cd python && for req in $(cat requirements.txt) pydot; do pip install $req; done && cd .. && \
mkdir build && cd build && \
cmake -DCPU_ONLY=1 .. && \
make -j"$(nproc)"
ENV PYCAFFE_ROOT $CAFFE_ROOT/python
ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH
ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH
RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig
WORKDIR /mace
# Benchmark
# Examples
load(
"//:mace.bzl",
"if_production_mode",
"if_not_production_mode",
"if_hexagon_enabled",
"if_openmp_enabled",
)
licenses(["notice"]) # Apache 2.0
cc_library(
name = "stat_summarizer",
srcs = ["stat_summarizer.cc"],
hdrs = ["stat_summarizer.h"],
linkstatic = 1,
deps = [
"@mace//:mace_headers",
],
)
cc_binary(
name = "benchmark_model",
srcs = [
"benchmark_model.cc",
],
linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
deps = [
":stat_summarizer",
"//mace/codegen:generated_models",
"//external:gflags_nothreads",
] + if_hexagon_enabled([
"//lib/hexagon:hexagon",
]) + if_production_mode([
"@mace//:mace_prod",
"//codegen:generated_opencl_prod",
"//codegen:generated_tuning_params",
]) + if_not_production_mode([
"@mace//:mace_dev",
]),
)
cc_library(
name = "libmace_merged",
srcs = [
"libmace_merged.a",
],
visibility = ["//visibility:private"],
)
cc_binary(
name = "model_throughput_test",
srcs = ["model_throughput_test.cc"],
linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
deps = [
":libmace_merged",
"//external:gflags_nothreads",
"//lib/hexagon",
"@mace//:mace",
"@mace//:mace_headers",
"@mace//:mace_prod",
],
)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "gflags/gflags.h"
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
#include "benchmark/stat_summarizer.h"
#include <cstdlib>
#include <fstream>
#include <thread>
#include <numeric>
#include <sys/time.h>
namespace mace {
namespace MACE_MODEL_TAG {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelChecksum();
}
}
namespace mace {
namespace str_util {
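// Splits |str| on the given delimiter character; the delimiter itself is
// dropped from the results.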
std::vector<std::string> Split(const std::string &str, char delims) {
std::vector<std::string> result;
std::string tmp = str;
while (!tmp.empty()) {
size_t next_offset = tmp.find(delims);
result.push_back(tmp.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
return result;
}
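// Parses a delimiter-separated list of integers, e.g. "1,224,224,3", into
// |result|. Always returns true; malformed fields parse as 0 via atoi.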
bool SplitAndParseToInts(const std::string &str,
char delims,
std::vector<int64_t> *result) {
std::string tmp = str;
while (!tmp.empty()) {
int64_t dim = atoi(tmp.data());
result->push_back(dim);
size_t next_offset = tmp.find(delims);
if (next_offset == std::string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
return true;
}
} // namespace str_util
namespace benchmark {
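// Parses a comma-separated shape string such as "1,224,224,3" into dims.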
void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
std::string tmp = str;
while (!tmp.empty()) {
int dim = atoi(tmp.data());
shape->push_back(dim);
size_t next_offset = tmp.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
}
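// Replaces non-alphanumeric characters with '_' so node names can be used
// as file name suffixes.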
std::string FormatName(const std::string input) {
std::string res = input;
for (size_t i = 0; i < input.size(); ++i) {
if (!::isalnum(res[i])) res[i] = '_';
}
return res;
}
inline int64_t NowMicros() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
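// Runs a single inference, returning its latency in |inference_time_us|;
// when |summarizer| is non-null, collects per-op RunMetadata into it.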
bool RunInference(MaceEngine *engine,
const std::vector<mace::MaceInputInfo> &input_infos,
std::map<std::string, float*> &output_infos,
StatSummarizer *summarizer,
int64_t *inference_time_us) {
RunMetadata run_metadata;
RunMetadata *run_metadata_ptr = nullptr;
if (summarizer) {
run_metadata_ptr = &run_metadata;
}
if (input_infos.size() == 1 && output_infos.size() == 1) {
const int64_t start_time = NowMicros();
bool s = engine->Run(input_infos[0].data, input_infos[0].shape,
output_infos.begin()->second, run_metadata_ptr);
const int64_t end_time = NowMicros();
if (!s) {
LOG(ERROR) << "Error during inference.";
return s;
}
*inference_time_us = end_time - start_time;
} else {
const int64_t start_time = NowMicros();
bool s = engine->Run(input_infos, output_infos, run_metadata_ptr);
const int64_t end_time = NowMicros();
if (!s) {
LOG(ERROR) << "Error during inference.";
return s;
}
*inference_time_us = end_time - start_time;
}
if (summarizer != nullptr) {
summarizer->ProcessMetadata(run_metadata);
}
return true;
}
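// Runs inference repeatedly, up to |num_runs| times (run until the time limit
// if |num_runs| <= 0), stopping early once |max_time_sec| elapses; sleeps
// |sleep_sec| seconds between runs and accumulates latency statistics.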
bool Run(MaceEngine *engine,
const std::vector<mace::MaceInputInfo> &input_infos,
std::map<std::string, float*> &output_infos,
StatSummarizer *summarizer,
int num_runs,
double max_time_sec,
int64_t sleep_sec,
int64_t *total_time_us,
int64_t *actual_num_runs) {
*total_time_us = 0;
LOG(INFO) << "Running benchmark for max " << num_runs << " iterators, max "
<< max_time_sec << " seconds "
<< (summarizer != nullptr ? "with " : "without ")
<< "detailed stat logging, with " << sleep_sec
<< "s sleep between inferences";
Stat<int64_t> stat;
  bool until_max_time = (num_runs <= 0);
  for (int i = 0; until_max_time || i < num_runs; ++i) {
int64_t inference_time_us = 0;
    bool s = RunInference(engine, input_infos, output_infos, summarizer,
                          &inference_time_us);
    // Check for failure before recording stats so a failed run is never
    // counted toward the totals.
    if (!s) {
      LOG(INFO) << "Failed on run " << i;
      return s;
    }
    stat.UpdateStat(inference_time_us);
    (*total_time_us) += inference_time_us;
    ++(*actual_num_runs);
    if (max_time_sec > 0 && (*total_time_us / 1000000.0) > max_time_sec) {
      break;
    }
if (sleep_sec > 0) {
std::this_thread::sleep_for(std::chrono::seconds(sleep_sec));
}
}
std::stringstream stream;
stat.OutputToStream(&stream);
LOG(INFO) << stream.str();
return true;
}
DEFINE_string(device, "CPU", "Device [CPU|OPENCL]");
DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma");
DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma");
DEFINE_string(input_shape, "", "input shape, separated by colon and comma");
DEFINE_string(output_shape, "", "output shape, separated by colon and comma");
DEFINE_string(input_file, "", "input file name");
DEFINE_int32(max_num_runs, 100, "number of runs max");
DEFINE_string(max_time, "10.0", "max run time, in seconds");
DEFINE_string(inference_delay, "-1", "delay between runs in seconds");
DEFINE_string(inter_benchmark_delay, "-1", "delay between benchmarks in seconds");
DEFINE_string(benchmark_name, "", "benchmark name");
DEFINE_bool(show_run_order, true, "whether to list stats by run order");
DEFINE_int32(run_order_limit, 0, "how many items to show by run order");
DEFINE_bool(show_time, true, "whether to list stats by time taken");
DEFINE_int32(time_limit, 10, "how many items to show by time taken");
DEFINE_bool(show_memory, false, "whether to list stats by memory used");
DEFINE_int32(memory_limit, 10, "how many items to show by memory used");
DEFINE_bool(show_type, true, "whether to list stats by op type");
DEFINE_bool(show_summary, true, "whether to show a summary of the stats");
DEFINE_bool(show_flops, true, "whether to estimate the model's FLOPs");
DEFINE_int32(warmup_runs, 1, "how many runs to initialize model");
DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0");
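// A hypothetical example invocation (node names, shapes and file names are
// placeholders, not values from a real model):
//   benchmark_model --device=OPENCL \
//     --input_node=input_node0 --input_shape=1,224,224,3 \
//     --output_node=output_node0 --output_shape=1,224,224,2 \
//     --input_file=input_data --model_data_file=model_data.data \
//     --max_num_runs=100 --max_time=10.0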
int Main(int argc, char **argv) {
  MACE_CHECK(FLAGS_device != "HEXAGON", "The model benchmark tool does not support DSP.");
gflags::SetUsageMessage("some usage message");
gflags::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "Benchmark name: [" << FLAGS_benchmark_name << "]";
LOG(INFO) << "Device: [" << FLAGS_device << "]";
LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]";
LOG(INFO) << "Input shapes: [" << FLAGS_input_shape << "]";
LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]";
LOG(INFO) << "output shapes: [" << FLAGS_output_shape << "]";
LOG(INFO) << "Warmup runs: [" << FLAGS_warmup_runs << "]";
LOG(INFO) << "Num runs: [" << FLAGS_max_num_runs << "]";
LOG(INFO) << "Inter-inference delay (seconds): [" << FLAGS_inference_delay << "]";
LOG(INFO) << "Inter-benchmark delay (seconds): [" << FLAGS_inter_benchmark_delay << "]";
const long int inter_inference_sleep_seconds =
std::strtol(FLAGS_inference_delay.c_str(), nullptr, 10);
const long int inter_benchmark_sleep_seconds =
std::strtol(FLAGS_inter_benchmark_delay.c_str(), nullptr, 10);
const double max_benchmark_time_seconds =
std::strtod(FLAGS_max_time.c_str(), nullptr);
std::unique_ptr<StatSummarizer> stats;
StatSummarizerOptions stats_options;
stats_options.show_run_order = FLAGS_show_run_order;
stats_options.run_order_limit = FLAGS_run_order_limit;
stats_options.show_time = FLAGS_show_time;
stats_options.time_limit = FLAGS_time_limit;
stats_options.show_memory = FLAGS_show_memory;
stats_options.memory_limit = FLAGS_memory_limit;
stats_options.show_type = FLAGS_show_type;
stats_options.show_summary = FLAGS_show_summary;
stats.reset(new StatSummarizer(stats_options));
DeviceType device_type = CPU;
  if (FLAGS_device == "OPENCL") {
device_type = OPENCL;
}
std::vector<std::string> input_names = str_util::Split(FLAGS_input_node, ',');
std::vector<std::string> output_names = str_util::Split(FLAGS_output_node, ',');
std::vector<std::string> input_shapes = str_util::Split(FLAGS_input_shape, ':');
std::vector<std::string> output_shapes = str_util::Split(FLAGS_output_shape, ':');
const size_t input_count = input_shapes.size();
const size_t output_count = output_shapes.size();
std::vector<std::vector<int64_t>> input_shape_vec(input_count);
std::vector<std::vector<int64_t>> output_shape_vec(output_count);
for (size_t i = 0; i < input_count; ++i) {
ParseShape(input_shapes[i], &input_shape_vec[i]);
}
for (size_t i = 0; i < output_count; ++i) {
ParseShape(output_shapes[i], &output_shape_vec[i]);
}
const unsigned char *model_data =
mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
std::vector<mace::MaceInputInfo> input_infos(input_count);
std::map<std::string, float*> output_infos;
std::vector<std::unique_ptr<float[]>> input_datas(input_count);
std::vector<std::unique_ptr<float[]>> output_datas(output_count);
for (size_t i = 0; i < input_count; ++i) {
int64_t input_size = std::accumulate(input_shape_vec[i].begin(),
input_shape_vec[i].end(), 1, std::multiplies<int64_t>());
input_datas[i].reset(new float[input_size]);
// load input
std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), std::ios::in | std::ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(input_datas[i].get()),
input_size * sizeof(float));
in_file.close();
} else {
LOG(INFO) << "Open input file failed";
return -1;
}
input_infos[i].name = input_names[i];
input_infos[i].shape = input_shape_vec[i];
input_infos[i].data = input_datas[i].get();
}
for (size_t i = 0; i < output_count; ++i) {
int64_t output_size = std::accumulate(output_shape_vec[i].begin(),
output_shape_vec[i].end(), 1, std::multiplies<int64_t>());
output_datas[i].reset(new float[output_size]);
output_infos[output_names[i]] = output_datas[i].get();
}
// Init model
LOG(INFO) << "Run init";
std::unique_ptr<mace::MaceEngine> engine_ptr;
if (input_count == 1 && output_count == 1) {
engine_ptr.reset(new mace::MaceEngine(&net_def, device_type));
} else {
engine_ptr.reset(new mace::MaceEngine(&net_def, device_type, input_names, output_names));
}
if (device_type == DeviceType::OPENCL) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
LOG(INFO) << "Warm up";
int64_t warmup_time_us = 0;
int64_t num_warmup_runs = 0;
if (FLAGS_warmup_runs > 0) {
bool status =
Run(engine_ptr.get(), input_infos, output_infos, nullptr, FLAGS_warmup_runs, -1.0,
inter_inference_sleep_seconds, &warmup_time_us, &num_warmup_runs);
if (!status) {
LOG(ERROR) << "Failed at warm up run";
}
}
if (inter_benchmark_sleep_seconds > 0) {
std::this_thread::sleep_for(
std::chrono::seconds(inter_benchmark_sleep_seconds));
}
int64_t no_stat_time_us = 0;
int64_t no_stat_runs = 0;
bool status =
Run(engine_ptr.get(), input_infos, output_infos,
nullptr, FLAGS_max_num_runs, max_benchmark_time_seconds,
inter_inference_sleep_seconds, &no_stat_time_us, &no_stat_runs);
if (!status) {
LOG(ERROR) << "Failed at normal no-stat run";
}
int64_t stat_time_us = 0;
int64_t stat_runs = 0;
status = Run(engine_ptr.get(), input_infos, output_infos,
stats.get(), FLAGS_max_num_runs, max_benchmark_time_seconds,
inter_inference_sleep_seconds, &stat_time_us, &stat_runs);
if (!status) {
LOG(ERROR) << "Failed at normal stat run";
}
LOG(INFO) << "Average inference timings in us: "
<< "Warmup: "
<< (FLAGS_warmup_runs > 0 ? warmup_time_us / FLAGS_warmup_runs : 0) << ", "
<< "no stats: " << no_stat_time_us / no_stat_runs << ", "
<< "with stats: " << stat_time_us / stat_runs;
stats->PrintOperatorStats();
return 0;
}
} // namespace benchmark
} // namespace mace
int main(int argc, char **argv) { return mace::benchmark::Main(argc, argv); }
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
/**
* Usage:
* throughput_test \
* --input_shape=1,224,224,3 \
* --output_shape=1,224,224,2 \
* --input_file=input_data \
* --cpu_model_data_file=cpu_model_data.data \
* --gpu_model_data_file=gpu_model_data.data \
* --dsp_model_data_file=dsp_model_data.data \
* --run_seconds=10
*/
#include <malloc.h>
#include <stdint.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <thread>
#include "gflags/gflags.h"
#include "mace/public/mace.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
using namespace std;
using namespace mace;
namespace mace {
#ifdef MACE_CPU_MODEL_TAG
namespace MACE_CPU_MODEL_TAG {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelChecksum();
} // namespace MACE_CPU_MODEL_TAG
#endif
#ifdef MACE_GPU_MODEL_TAG
namespace MACE_GPU_MODEL_TAG {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelChecksum();
} // namespace MACE_GPU_MODEL_TAG
#endif
#ifdef MACE_DSP_MODEL_TAG
namespace MACE_DSP_MODEL_TAG {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelChecksum();
} // namespace MACE_DSP_MODEL_TAG
#endif
} // namespace mace
void ParseShape(const string &str, vector<int64_t> *shape) {
string tmp = str;
while (!tmp.empty()) {
int dim = atoi(tmp.data());
shape->push_back(dim);
size_t next_offset = tmp.find(",");
if (next_offset == string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
}
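// Maps a device name string to its DeviceType; unknown names fall back to CPU.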
DeviceType ParseDeviceType(const string &device_str) {
if (device_str.compare("CPU") == 0) {
return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) {
return DeviceType::NEON;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON;
} else {
return DeviceType::CPU;
}
}
DEFINE_string(input_shape, "1,224,224,3", "input shape, separated by comma");
DEFINE_string(output_shape, "1,224,224,2", "output shape, separated by comma");
DEFINE_string(input_file, "", "input file name");
DEFINE_string(cpu_model_data_file, "", "cpu model data file name");
DEFINE_string(gpu_model_data_file, "", "gpu model data file name");
DEFINE_string(dsp_model_data_file, "", "dsp model data file name");
DEFINE_int32(run_seconds, 10, "run seconds");
int main(int argc, char **argv) {
gflags::SetUsageMessage("some usage message");
gflags::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "mace version: " << MaceVersion();
LOG(INFO) << "mace git version: " << MaceGitVersion();
#ifdef MACE_CPU_MODEL_TAG
LOG(INFO) << "cpu model checksum: " << mace::MACE_CPU_MODEL_TAG::ModelChecksum();
#endif
#ifdef MACE_GPU_MODEL_TAG
LOG(INFO) << "gpu model checksum: " << mace::MACE_GPU_MODEL_TAG::ModelChecksum();
#endif
#ifdef MACE_DSP_MODEL_TAG
LOG(INFO) << "dsp model checksum: " << mace::MACE_DSP_MODEL_TAG::ModelChecksum();
#endif
LOG(INFO) << "input_shape: " << FLAGS_input_shape;
LOG(INFO) << "output_shape: " << FLAGS_output_shape;
LOG(INFO) << "input_file: " << FLAGS_input_file;
LOG(INFO) << "cpu_model_data_file: " << FLAGS_cpu_model_data_file;
LOG(INFO) << "gpu_model_data_file: " << FLAGS_gpu_model_data_file;
LOG(INFO) << "dsp_model_data_file: " << FLAGS_dsp_model_data_file;
LOG(INFO) << "run_seconds: " << FLAGS_run_seconds;
vector<int64_t> input_shape_vec;
vector<int64_t> output_shape_vec;
ParseShape(FLAGS_input_shape, &input_shape_vec);
ParseShape(FLAGS_output_shape, &output_shape_vec);
int64_t input_size =
std::accumulate(input_shape_vec.begin(), input_shape_vec.end(), 1,
std::multiplies<int64_t>());
int64_t output_size =
std::accumulate(output_shape_vec.begin(), output_shape_vec.end(), 1,
std::multiplies<int64_t>());
std::unique_ptr<float[]> input_data(new float[input_size]);
std::unique_ptr<float[]> cpu_output_data(new float[output_size]);
std::unique_ptr<float[]> gpu_output_data(new float[output_size]);
std::unique_ptr<float[]> dsp_output_data(new float[output_size]);
// load input
ifstream in_file(FLAGS_input_file, ios::in | ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(input_data.get()),
input_size * sizeof(float));
in_file.close();
} else {
LOG(INFO) << "Open input file failed";
return -1;
}
  int64_t t0, t1;
#ifdef MACE_CPU_MODEL_TAG
/* --------------------- CPU init ----------------------- */
LOG(INFO) << "Load & init cpu model and warm up";
const unsigned char *cpu_model_data =
mace::MACE_CPU_MODEL_TAG::LoadModelData(FLAGS_cpu_model_data_file.c_str());
NetDef cpu_net_def = mace::MACE_CPU_MODEL_TAG::CreateNet(cpu_model_data);
mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU);
LOG(INFO) << "CPU Warm up run";
t0 = NowMicros();
cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
t1 = NowMicros();
LOG(INFO) << "CPU 1st warm up run latency: " << t1 - t0 << " us";
#endif
#ifdef MACE_GPU_MODEL_TAG
/* --------------------- GPU init ----------------------- */
LOG(INFO) << "Load & init gpu model and warm up";
const unsigned char *gpu_model_data =
mace::MACE_GPU_MODEL_TAG::LoadModelData(FLAGS_gpu_model_data_file.c_str());
NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL);
mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
LOG(INFO) << "GPU Warm up run";
t0 = NowMicros();
gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
t1 = NowMicros();
LOG(INFO) << "GPU 1st warm up run latency: " << t1 - t0 << " us";
#endif
#ifdef MACE_DSP_MODEL_TAG
/* --------------------- DSP init ----------------------- */
LOG(INFO) << "Load & init dsp model and warm up";
const unsigned char *dsp_model_data =
      mace::MACE_DSP_MODEL_TAG::LoadModelData(FLAGS_dsp_model_data_file.c_str());
NetDef dsp_net_def = mace::MACE_DSP_MODEL_TAG::CreateNet(dsp_model_data);
mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON);
mace::MACE_DSP_MODEL_TAG::UnloadModelData(dsp_model_data);
LOG(INFO) << "DSP Warm up run";
t0 = NowMicros();
  dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
t1 = NowMicros();
LOG(INFO) << "DSP 1st warm up run latency: " << t1 - t0 << " us";
#endif
double cpu_throughput = 0;
double gpu_throughput = 0;
double dsp_throughput = 0;
int64_t run_micros = FLAGS_run_seconds * 1000000;
#ifdef MACE_CPU_MODEL_TAG
std::thread cpu_thread([&]() {
int64_t frames = 0;
int64_t micros = 0;
int64_t start = NowMicros();
for (; micros < run_micros; ++frames) {
cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
int64_t end = NowMicros();
micros = end - start;
}
cpu_throughput = frames * 1000000.0 / micros;
});
#endif
#ifdef MACE_GPU_MODEL_TAG
std::thread gpu_thread([&]() {
int64_t frames = 0;
int64_t micros = 0;
int64_t start = NowMicros();
for (; micros < run_micros; ++frames) {
gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
int64_t end = NowMicros();
micros = end - start;
}
gpu_throughput = frames * 1000000.0 / micros;
});
#endif
#ifdef MACE_DSP_MODEL_TAG
std::thread dsp_thread([&]() {
int64_t frames = 0;
int64_t micros = 0;
int64_t start = NowMicros();
for (; micros < run_micros; ++frames) {
dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
int64_t end = NowMicros();
micros = end - start;
}
dsp_throughput = frames * 1000000.0 / micros;
});
#endif
double total_throughput = 0;
#ifdef MACE_CPU_MODEL_TAG
cpu_thread.join();
LOG(INFO) << "CPU throughput: " << cpu_throughput << " f/s";
total_throughput += cpu_throughput;
#endif
#ifdef MACE_GPU_MODEL_TAG
gpu_thread.join();
LOG(INFO) << "GPU throughput: " << gpu_throughput << " f/s";
total_throughput += gpu_throughput;
#endif
#ifdef MACE_DSP_MODEL_TAG
dsp_thread.join();
LOG(INFO) << "DSP throughput: " << dsp_throughput << " f/s";
total_throughput += dsp_throughput;
#endif
LOG(INFO) << "Total throughput: " << total_throughput << " f/s";
}
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "benchmark/stat_summarizer.h"
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
#include <iomanip>
#include <queue>
#include <iostream>
namespace mace {
StatSummarizer::StatSummarizer(const StatSummarizerOptions &options)
: options_(options) {}
StatSummarizer::~StatSummarizer() {}
void StatSummarizer::Reset() {
run_total_us_.Reset();
memory_.Reset();
details_.clear();
}
void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) {
int64_t curr_total_us = 0;
int64_t mem_total = 0;
if (run_metadata.op_stats.empty()) {
std::cerr << "Runtime op stats should not be empty" << std::endl;
abort();
}
int64_t first_node_start_us = run_metadata.op_stats[0].stats.start_micros;
int node_num = 0;
for (const auto &ops : run_metadata.op_stats) {
std::string name = ops.operator_name;
std::string op_type = ops.type;
++node_num;
const int64_t curr_time = ops.stats.end_micros - ops.stats.start_micros;
curr_total_us += curr_time;
auto result = details_.emplace(name, Detail());
Detail *detail = &(result.first->second);
detail->start_us.UpdateStat(ops.stats.start_micros - first_node_start_us);
detail->rel_end_us.UpdateStat(curr_time);
// If this is the first pass, initialize some values.
if (result.second) {
detail->name = name;
detail->type = op_type;
detail->run_order = node_num;
detail->times_called = 0;
}
++detail->times_called;
}
run_total_us_.UpdateStat(curr_total_us);
memory_.UpdateStat(mem_total);
}
std::string StatSummarizer::ShortSummary() const {
std::stringstream stream;
stream << "Timings (microseconds): ";
run_total_us_.OutputToStream(&stream);
stream << std::endl;
stream << "Memory (bytes): ";
memory_.OutputToStream(&stream);
stream << std::endl;
stream << details_.size() << " nodes observed" << std::endl;
return stream.str();
}
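// Prepends a tab and configures |stream| for a right-aligned, fixed-precision
// field of the given width.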
std::ostream &InitField(std::ostream &stream, int width) {
stream << "\t" << std::right << std::setw(width) << std::fixed
<< std::setprecision(3);
return stream;
}
std::string StatSummarizer::HeaderString(const std::string &title) const {
std::stringstream stream;
stream << "============================== " << title
<< " ==============================" << std::endl;
InitField(stream, 14) << "[node type]";
InitField(stream, 9) << "[start]";
InitField(stream, 9) << "[first]";
InitField(stream, 9) << "[avg ms]";
InitField(stream, 8) << "[%]";
InitField(stream, 8) << "[cdf%]";
InitField(stream, 10) << "[mem KB]";
InitField(stream, 9) << "[times called]";
stream << "\t"
<< "[Name]";
return stream.str();
}
std::string StatSummarizer::ColumnString(const StatSummarizer::Detail &detail,
const int64_t cumulative_stat_on_node,
const Stat<int64_t> &stat) const {
const double start_ms = detail.start_us.avg() / 1000.0;
const double first_time_ms = detail.rel_end_us.first() / 1000.0;
const double avg_time_ms = detail.rel_end_us.avg() / 1000.0;
const double percentage = detail.rel_end_us.sum() * 100.0 / stat.sum();
const double cdf_percentage = (cumulative_stat_on_node * 100.0f) / stat.sum();
const int64_t times_called = detail.times_called / num_runs();
std::stringstream stream;
InitField(stream, 14) << detail.type;
InitField(stream, 9) << start_ms;
InitField(stream, 9) << first_time_ms;
InitField(stream, 9) << avg_time_ms;
InitField(stream, 7) << percentage << "%";
InitField(stream, 7) << cdf_percentage << "%";
InitField(stream, 10) << detail.mem_used.newest() / 1000.0;
InitField(stream, 9) << times_called;
stream << "\t" << detail.name;
return stream.str();
}
void StatSummarizer::OrderNodesByMetric(
SortingMetric metric, std::vector<const Detail *> *details) const {
std::priority_queue<std::pair<std::string, const Detail *>> sorted_list;
const int num_nodes = details_.size();
for (const auto &det : details_) {
const Detail *detail = &(det.second);
std::stringstream stream;
stream << std::setw(20) << std::right << std::setprecision(10)
<< std::fixed;
switch (metric) {
case BY_NAME:
stream << detail->name;
break;
case BY_RUN_ORDER:
stream << num_nodes - detail->run_order;
break;
case BY_TIME:
stream << detail->rel_end_us.avg();
break;
case BY_MEMORY:
stream << detail->mem_used.avg();
break;
case BY_TYPE:
stream << detail->type;
break;
default:
stream << "";
break;
}
sorted_list.emplace(stream.str(), detail);
}
while (!sorted_list.empty()) {
auto entry = sorted_list.top();
sorted_list.pop();
details->push_back(entry.second);
}
}
void StatSummarizer::ComputeStatsByType(
std::map<std::string, int64_t> *node_type_map_count,
std::map<std::string, int64_t> *node_type_map_time,
std::map<std::string, int64_t> *node_type_map_memory,
std::map<std::string, int64_t> *node_type_map_times_called,
int64_t *accumulated_us) const {
int64_t run_count = run_total_us_.count();
for (const auto &det : details_) {
const std::string node_name = det.first;
const Detail &detail = det.second;
int64_t curr_time_val =
static_cast<int64_t>(detail.rel_end_us.sum() / run_count);
*accumulated_us += curr_time_val;
int64_t curr_memory_val = detail.mem_used.newest();
const std::string &node_type = detail.type;
(*node_type_map_count)[node_type] += 1;
(*node_type_map_time)[node_type] += curr_time_val;
(*node_type_map_memory)[node_type] += curr_memory_val;
(*node_type_map_times_called)[node_type] += detail.times_called / run_count;
}
}
std::string StatSummarizer::GetStatsByNodeType() const {
std::stringstream stream;
stream << "============================== Summary by node type "
"=============================="
<< std::endl;
LOG(INFO) << "Number of nodes executed: " << details_.size() << std::endl;
std::map<std::string, int64_t> node_type_map_count;
std::map<std::string, int64_t> node_type_map_time;
std::map<std::string, int64_t> node_type_map_memory;
std::map<std::string, int64_t> node_type_map_times_called;
int64_t accumulated_us = 0;
ComputeStatsByType(&node_type_map_count, &node_type_map_time,
&node_type_map_memory, &node_type_map_times_called,
&accumulated_us);
// Sort them.
std::priority_queue<std::pair<int64_t, std::pair<std::string, int64_t>>>
timings;
for (const auto &node_type : node_type_map_time) {
const int64_t mem_used = node_type_map_memory[node_type.first];
timings.emplace(node_type.second,
std::pair<std::string, int64_t>(node_type.first, mem_used));
}
InitField(stream, 14) << "[Node type]";
InitField(stream, 9) << "[count]";
InitField(stream, 10) << "[avg ms]";
InitField(stream, 11) << "[avg %]";
InitField(stream, 11) << "[cdf %]";
InitField(stream, 10) << "[mem KB]";
InitField(stream, 10) << "[times called]";
stream << std::endl;
float cdf = 0.0f;
while (!timings.empty()) {
auto entry = timings.top();
timings.pop();
const std::string node_type = entry.second.first;
const float memory = entry.second.second / 1000.0f;
const int64_t node_type_total_us = entry.first;
const float time_per_run_ms = node_type_total_us / 1000.0f;
const float percentage =
((entry.first / static_cast<float>(accumulated_us)) * 100.0f);
cdf += percentage;
InitField(stream, 14) << node_type;
InitField(stream, 9) << node_type_map_count[node_type];
InitField(stream, 10) << time_per_run_ms;
InitField(stream, 10) << percentage << "%";
InitField(stream, 10) << cdf << "%";
InitField(stream, 10) << memory;
InitField(stream, 9) << node_type_map_times_called[node_type];
stream << std::endl;
}
stream << std::endl;
return stream.str();
}
std::string StatSummarizer::GetStatsByMetric(const std::string &title,
SortingMetric sorting_metric,
int num_stats) const {
std::vector<const Detail *> details;
OrderNodesByMetric(sorting_metric, &details);
double cumulative_stat_on_node = 0;
std::stringstream stream;
stream << HeaderString(title) << std::endl;
int stat_num = 0;
for (auto detail : details) {
++stat_num;
if (num_stats > 0 && stat_num > num_stats) {
break;
}
cumulative_stat_on_node += detail->rel_end_us.sum();
stream << ColumnString(*detail, cumulative_stat_on_node, run_total_us_)
<< std::endl;
}
stream << std::endl;
return stream.str();
}
std::string StatSummarizer::GetOutputString() const {
std::stringstream stream;
if (options_.show_run_order) {
stream << GetStatsByMetric("Run Order", BY_RUN_ORDER,
options_.run_order_limit);
}
if (options_.show_time) {
stream << GetStatsByMetric("Top by Computation Time", BY_TIME,
options_.time_limit);
}
if (options_.show_memory) {
stream << GetStatsByMetric("Top by Memory Use", BY_MEMORY,
options_.memory_limit);
}
if (options_.show_type) {
stream << GetStatsByNodeType();
}
if (options_.show_summary) {
stream << ShortSummary() << std::endl;
}
return stream.str();
}
void StatSummarizer::PrintOperatorStats() const {
std::string output = GetOutputString();
std::istringstream iss(output);
for (std::string line; std::getline(iss, line);) {
LOG(INFO) << line;
}
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
#define MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
#include <stdlib.h>
#include <cmath>
#include <limits>
#include <map>
#include <sstream>
#include <string>
#include <vector>
namespace mace {
class RunMetadata;
template <typename ValueType, typename HighPrecisionValueType = double>
class Stat {
public:
void UpdateStat(ValueType v) {
if (count_ == 0) {
first_ = v;
}
newest_ = v;
max_ = std::max(v, max_);
min_ = std::min(v, min_);
++count_;
sum_ += v;
squared_sum_ += static_cast<HighPrecisionValueType>(v) * v;
}
void Reset() { new (this) Stat<ValueType, HighPrecisionValueType>(); }
bool empty() const { return count_ == 0; }
ValueType first() const { return first_; }
ValueType newest() const { return newest_; }
ValueType max() const { return max_; }
ValueType min() const { return min_; }
int64_t count() const { return count_; }
ValueType sum() const { return sum_; }
HighPrecisionValueType squared_sum() const { return squared_sum_; }
bool all_same() const { return (count_ == 0 || min_ == max_); }
HighPrecisionValueType avg() const {
return empty() ? std::numeric_limits<ValueType>::quiet_NaN()
: static_cast<HighPrecisionValueType>(sum_) / count_;
}
ValueType std_deviation() const {
return all_same() ? 0 : std::sqrt(squared_sum_ / count_ - avg() * avg());
}
void OutputToStream(std::ostream *stream) const {
if (empty()) {
*stream << "count=0";
} else if (all_same()) {
*stream << "count=" << count_ << " curr=" << newest_;
if (count_ > 1) *stream << "(all same)";
} else {
*stream << "count=" << count_ << " first=" << first_
<< " curr=" << newest_ << " min=" << min_ << " max=" << max_
<< " avg=" << avg() << " std=" << std_deviation();
}
}
friend std::ostream &operator<<(std::ostream &stream,
const Stat<ValueType> &stat) {
stat.OutputToStream(&stream);
return stream;
}
private:
ValueType first_ = 0;
ValueType newest_ = 0;
ValueType max_ = std::numeric_limits<ValueType>::min();
ValueType min_ = std::numeric_limits<ValueType>::max();
int64_t count_ = 0;
ValueType sum_ = 0;
HighPrecisionValueType squared_sum_ = 0;
};
// Used to control the output of the statistics summarizer.
class StatSummarizerOptions {
public:
StatSummarizerOptions()
: show_run_order(true),
run_order_limit(0),
show_time(true),
time_limit(10),
show_memory(true),
memory_limit(10),
show_type(true),
show_summary(true) {}
bool show_run_order;
int run_order_limit;
bool show_time;
int time_limit;
bool show_memory;
int memory_limit;
bool show_type;
bool show_summary;
};
// A StatSummarizer assists in performance analysis of Graph executions.
//
// It summarizes time spent executing (on GPU/CPU), memory used etc. across
// multiple executions of a single Graph from the StepStats collected during
// graph execution.
//
// See tensorflow/tools/benchmark/benchmark_model.cc for an example usage.
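//
// A minimal usage sketch (mirroring benchmark_model.cc; `engine` is assumed
// to be a mace::MaceEngine* created elsewhere):
//
//   StatSummarizerOptions options;
//   StatSummarizer summarizer(options);
//   RunMetadata run_metadata;
//   engine->Run(input, input_shape, output, &run_metadata);
//   summarizer.ProcessMetadata(run_metadata);
//   summarizer.PrintOperatorStats();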
class StatSummarizer {
public:
enum SortingMetric {
BY_NAME,
BY_RUN_ORDER,
BY_TIME,
BY_MEMORY,
BY_TYPE,
};
explicit StatSummarizer(const StatSummarizerOptions &options);
~StatSummarizer();
// Adds another run's StepStats output to the aggregate counts.
void ProcessMetadata(const RunMetadata &run_metadata);
// Returns a string detailing the accumulated runtime stats in a tab-separated
// format which can be pasted into a spreadsheet for further analysis.
std::string GetOutputString() const;
std::string ShortSummary() const;
// Prints the string returned by GetOutputString().
void PrintOperatorStats() const;
void ComputeStatsByType(
std::map<std::string, int64_t> *node_type_map_count,
std::map<std::string, int64_t> *node_type_map_time,
std::map<std::string, int64_t> *node_type_map_memory,
std::map<std::string, int64_t> *node_type_map_times_called,
int64_t *accumulated_us) const;
std::string GetStatsByNodeType() const;
std::string GetStatsByMetric(const std::string &title,
SortingMetric sorting_metric,
int num_stats) const;
void Reset();
// Returns number of runs.
int num_runs() const { return run_total_us_.count(); }
// Returns stats of total microseconds spent by all nodes in each run.
const Stat<int64_t> &run_total_us() const { return run_total_us_; }
private:
struct Detail {
std::string name;
std::string type;
int64_t run_order;
Stat<int64_t> start_us;
Stat<int64_t> rel_end_us;
Stat<int64_t> mem_used;
int64_t times_called;
};
void OrderNodesByMetric(SortingMetric sorting_metric,
std::vector<const Detail *> *details) const;
std::string HeaderString(const std::string &title) const;
std::string ColumnString(const Detail &detail,
const int64_t cumulative_stat_on_node,
const Stat<int64_t> &stat) const;
Stat<int64_t> run_total_us_;
Stat<int64_t> memory_;
std::map<std::string, Detail> details_;
StatSummarizerOptions options_;
};
} // namespace mace
#endif // MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_
...@@ -104,6 +104,7 @@ cc_library(
    deps = [
        ":opencl_headers",
        "//mace/codegen:generated_opencl_prod",
        "//mace/codegen:generated_tuning_params",
        "//mace/utils",
    ],
)
...@@ -24,3 +24,14 @@ cc_test(
        "//mace/core:test_benchmark_main",
    ],
)
cc_binary(
name = "mace_run",
srcs = ["mace_run.cc"],
linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
deps = [
"//mace/codegen:generated_models",
"//external:gflags_nothreads",
],
)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
/**
* Usage:
* mace_run --model=mobi_mace.pb \
* --input=input_node \
* --output=output_node \
* --input_shape=1,224,224,3 \
* --output_shape=1,224,224,2 \
* --input_file=input_data \
* --output_file=mace.out \
* --model_data_file=model_data.data \
* --device=OPENCL
*/
#include <malloc.h>
#include <stdint.h>
#include <sys/time.h>
#include <time.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include "gflags/gflags.h"
#include "mace/public/mace.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
using namespace std;
using namespace mace;
namespace mace {
namespace MACE_MODEL_TAG {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelChecksum();
} // namespace MACE_MODEL_TAG
} // namespace mace
namespace str_util {
std::vector<std::string> Split(const std::string &str, char delims) {
std::vector<std::string> result;
std::string tmp = str;
while (!tmp.empty()) {
size_t next_offset = tmp.find(delims);
result.push_back(tmp.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
return result;
}
} // namespace str_util
void ParseShape(const string &str, vector<int64_t> *shape) {
string tmp = str;
while (!tmp.empty()) {
int dim = atoi(tmp.data());
shape->push_back(dim);
size_t next_offset = tmp.find(",");
if (next_offset == string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
}
std::string FormatName(const std::string input) {
std::string res = input;
for (size_t i = 0; i < input.size(); ++i) {
if (!isalnum(res[i])) res[i] = '_';
}
return res;
}
DeviceType ParseDeviceType(const string &device_str) {
if (device_str.compare("CPU") == 0) {
return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) {
return DeviceType::NEON;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON;
} else {
return DeviceType::CPU;
}
}
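// Logs every mallinfo() field that changed since |prev| and returns the
// current snapshot so callers can chain successive checks.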
struct mallinfo LogMallinfoChange(struct mallinfo prev) {
struct mallinfo curr = mallinfo();
if (prev.arena != curr.arena) {
LOG(INFO) << "Non-mmapped space allocated (bytes): " << curr.arena
<< ", diff: " << ((int64_t)curr.arena - (int64_t)prev.arena);
}
if (prev.ordblks != curr.ordblks) {
LOG(INFO) << "Number of free chunks: " << curr.ordblks
<< ", diff: " << ((int64_t)curr.ordblks - (int64_t)prev.ordblks);
}
if (prev.smblks != curr.smblks) {
LOG(INFO) << "Number of free fastbin blocks: " << curr.smblks
<< ", diff: " << ((int64_t)curr.smblks - (int64_t)prev.smblks);
}
if (prev.hblks != curr.hblks) {
LOG(INFO) << "Number of mmapped regions: " << curr.hblks
<< ", diff: " << ((int64_t)curr.hblks - (int64_t)prev.hblks);
}
if (prev.hblkhd != curr.hblkhd) {
LOG(INFO) << "Space allocated in mmapped regions (bytes): " << curr.hblkhd
<< ", diff: " << ((int64_t)curr.hblkhd - (int64_t)prev.hblkhd);
}
if (prev.usmblks != curr.usmblks) {
LOG(INFO) << "Maximum total allocated space (bytes): " << curr.usmblks
<< ", diff: " << ((int64_t)curr.usmblks - (int64_t)prev.usmblks);
}
if (prev.fsmblks != curr.fsmblks) {
LOG(INFO) << "Space in freed fastbin blocks (bytes): " << curr.fsmblks
<< ", diff: " << ((int64_t)curr.fsmblks - (int64_t)prev.fsmblks);
}
if (prev.uordblks != curr.uordblks) {
LOG(INFO) << "Total allocated space (bytes): " << curr.uordblks
<< ", diff: "
<< ((int64_t)curr.uordblks - (int64_t)prev.uordblks);
}
if (prev.fordblks != curr.fordblks) {
LOG(INFO) << "Total free space (bytes): " << curr.fordblks << ", diff: "
<< ((int64_t)curr.fordblks - (int64_t)prev.fordblks);
}
if (prev.keepcost != curr.keepcost) {
LOG(INFO) << "Top-most, releasable space (bytes): " << curr.keepcost
<< ", diff: "
<< ((int64_t)curr.keepcost - (int64_t)prev.keepcost);
}
return curr;
}
DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma");
DEFINE_string(input_shape, "1,224,224,3:1,1,1,10", "input shapes, separated by colon and comma");
DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma");
DEFINE_string(output_shape, "1,224,224,2:1,1,1,10", "output shapes, separated by colon and comma");
DEFINE_string(input_file, "", "input file name | input file prefix for multiple inputs.");
DEFINE_string(output_file, "", "output file name | output file prefix for multiple outputs");
DEFINE_string(model_data_file, "",
"model data file name, used when EMBED_MODEL_DATA set to 0");
DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
DEFINE_int32(round, 1, "round");
DEFINE_int32(restart_round, 1, "restart round");
DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
bool SingleInputAndOutput(const std::vector<int64_t> &input_shape,
const std::vector<int64_t> &output_shape) {
// load model
int64_t t0 = NowMicros();
const unsigned char *model_data =
mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
int64_t t1 = NowMicros();
LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us";
int64_t init_micros = t1 - t0;
DeviceType device_type = ParseDeviceType(FLAGS_device);
LOG(INFO) << "Runing with device type: " << device_type;
// Init model
LOG(INFO) << "Run init";
t0 = NowMicros();
mace::MaceEngine engine(&net_def, device_type);
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
t1 = NowMicros();
init_micros += t1 - t0;
LOG(INFO) << "Net init latency: " << t1 - t0 << " us";
LOG(INFO) << "Total init latency: " << init_micros << " us";
// Allocate input and output
int64_t input_size =
std::accumulate(input_shape.begin(), input_shape.end(), 1,
std::multiplies<int64_t>());
int64_t output_size =
std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int64_t>());
std::unique_ptr<float[]> input_data(new float[input_size]);
std::unique_ptr<float[]> output_data(new float[output_size]);
// load input
ifstream in_file(FLAGS_input_file + "_" + FormatName(FLAGS_input_node), ios::in | ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(input_data.get()),
input_size * sizeof(float));
in_file.close();
} else {
LOG(INFO) << "Open input file failed";
return -1;
}
LOG(INFO) << "Warm up run";
t0 = NowMicros();
engine.Run(input_data.get(), input_shape, output_data.get());
t1 = NowMicros();
LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us";
if (FLAGS_round > 0) {
LOG(INFO) << "Run model";
t0 = NowMicros();
struct mallinfo prev = mallinfo();
for (int i = 0; i < FLAGS_round; ++i) {
engine.Run(input_data.get(), input_shape, output_data.get());
if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
LOG(INFO) << "=== check malloc info change #" << i << " ===";
prev = LogMallinfoChange(prev);
}
}
t1 = NowMicros();
LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us";
}
if (FLAGS_restart_round == 1) {
if (output_data != nullptr) {
std::string
output_name = FLAGS_output_file + "_" + FormatName(FLAGS_output_node);
ofstream out_file(output_name, ios::binary);
out_file.write((const char *) (output_data.get()),
output_size * sizeof(float));
out_file.flush();
out_file.close();
LOG(INFO) << "Write output file "
<< output_name
<< " with size " << output_size
<< " done.";
} else {
LOG(INFO) << "Output data is null";
}
}
return true;
}
bool MultipleInputOrOutput(const std::vector<std::string> &input_names,
const std::vector<std::vector<int64_t>> &input_shapes,
const std::vector<std::string> &output_names,
const std::vector<std::vector<int64_t>> &output_shapes) {
// load model
int64_t t0 = NowMicros();
const unsigned char *model_data =
mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
int64_t t1 = NowMicros();
LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us";
int64_t init_micros = t1 - t0;
DeviceType device_type = ParseDeviceType(FLAGS_device);
LOG(INFO) << "Runing with device type: " << device_type;
// Init model
LOG(INFO) << "Run init";
t0 = NowMicros();
mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
t1 = NowMicros();
init_micros += t1 - t0;
LOG(INFO) << "Net init latency: " << t1 - t0 << " us";
LOG(INFO) << "Total init latency: " << init_micros << " us";
const size_t input_count = input_names.size();
const size_t output_count = output_names.size();
std::vector<mace::MaceInputInfo> input_infos(input_count);
std::map<std::string, float*> outputs;
std::vector<std::unique_ptr<float[]>> input_datas(input_count);
for (size_t i = 0; i < input_count; ++i) {
// Allocate input and output
int64_t input_size =
std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
std::multiplies<int64_t>());
input_datas[i].reset(new float[input_size]);
// load input
ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), ios::in | ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(input_datas[i].get()),
input_size * sizeof(float));
in_file.close();
} else {
LOG(INFO) << "Open input file failed";
return -1;
}
input_infos[i].name = input_names[i];
input_infos[i].shape = input_shapes[i];
input_infos[i].data = input_datas[i].get();
}
std::vector<std::unique_ptr<float[]>> output_datas(output_count);
for (size_t i = 0; i < output_count; ++i) {
int64_t output_size =
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
std::multiplies<int64_t>());
output_datas[i].reset(new float[output_size]);
outputs[output_names[i]] = output_datas[i].get();
}
LOG(INFO) << "Warm up run";
t0 = NowMicros();
engine.Run(input_infos, outputs);
t1 = NowMicros();
LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us";
if (FLAGS_round > 0) {
LOG(INFO) << "Run model";
t0 = NowMicros();
struct mallinfo prev = mallinfo();
for (int i = 0; i < FLAGS_round; ++i) {
engine.Run(input_infos, outputs);
if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
LOG(INFO) << "=== check malloc info change #" << i << " ===";
prev = LogMallinfoChange(prev);
}
}
t1 = NowMicros();
LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us";
}
for (size_t i = 0; i < output_count; ++i) {
std::string output_name = FLAGS_output_file + "_" + FormatName(output_names[i]);
ofstream out_file(output_name, ios::binary);
int64_t output_size =
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
std::multiplies<int64_t>());
out_file.write((const char *) outputs[output_names[i]],
output_size * sizeof(float));
out_file.flush();
out_file.close();
LOG(INFO) << "Write output file "
<< output_name
<< " with size " << output_size
<< " done.";
}
return true;
}
int main(int argc, char **argv) {
gflags::SetUsageMessage("some usage message");
gflags::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "mace version: " << MaceVersion();
LOG(INFO) << "mace git version: " << MaceGitVersion();
LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum();
LOG(INFO) << "input node: " << FLAGS_input_node;
LOG(INFO) << "input shape: " << FLAGS_input_shape;
LOG(INFO) << "output node: " << FLAGS_output_node;
LOG(INFO) << "output shape: " << FLAGS_output_shape;
LOG(INFO) << "input_file: " << FLAGS_input_file;
LOG(INFO) << "output_file: " << FLAGS_output_file;
LOG(INFO) << "model_data_file: " << FLAGS_model_data_file;
LOG(INFO) << "device: " << FLAGS_device;
LOG(INFO) << "round: " << FLAGS_restart_round;
LOG(INFO) << "restart_round: " << FLAGS_round;
std::vector<std::string> input_names = str_util::Split(FLAGS_input_node, ',');
std::vector<std::string> output_names = str_util::Split(FLAGS_output_node, ',');
std::vector<std::string> input_shapes = str_util::Split(FLAGS_input_shape, ':');
std::vector<std::string> output_shapes = str_util::Split(FLAGS_output_shape, ':');
const size_t input_count = input_shapes.size();
const size_t output_count = output_shapes.size();
std::vector<vector<int64_t>> input_shape_vec(input_count);
std::vector<vector<int64_t>> output_shape_vec(output_count);
for (size_t i = 0; i < input_count; ++i) {
ParseShape(input_shapes[i], &input_shape_vec[i]);
}
for (size_t i = 0; i < output_count; ++i) {
ParseShape(output_shapes[i], &output_shape_vec[i]);
}
bool ret;
#pragma omp parallel for
for (int i = 0; i < FLAGS_restart_round; ++i) {
VLOG(0) << "restart round " << i;
if (input_count == 1 && output_count == 1) {
ret = SingleInputAndOutput(input_shape_vec[0], output_shape_vec[0]);
} else {
ret = MultipleInputOrOutput(input_names,
input_shape_vec,
output_names,
output_shape_vec);
}
}
  if (ret) {
return 0;
} else {
return -1;
}
}
...@@ -10,6 +10,15 @@ licenses(["notice"]) # Apache 2.0
load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
py_proto_library(
    name = "mace_py",
    srcs = ["mace.proto"],
    default_runtime = "@com_google_protobuf//:protobuf_python",
    protoc = "@com_google_protobuf//:protoc",
    srcs_version = "PY2AND3",
    deps = ["@com_google_protobuf//:protobuf_python"],
)
py_proto_library(
    name = "caffe_py",
    srcs = ["caffe.proto"],
...
...@@ -98,7 +98,7 @@ message NetParameter { ...@@ -98,7 +98,7 @@ message NetParameter {
// NOTE // NOTE
// Update the next available ID when you add a new SolverParameter field. // Update the next available ID when you add a new SolverParameter field.
// //
// SolverParameter next available ID: 41 (last added: type) // SolverParameter next available ID: 43 (last added: weights)
message SolverParameter { message SolverParameter {
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Specifying the train and test networks // Specifying the train and test networks
...@@ -128,8 +128,7 @@ message SolverParameter { ...@@ -128,8 +128,7 @@ message SolverParameter {
// The states for the train/test nets. Must be unspecified or // The states for the train/test nets. Must be unspecified or
// specified once per net. // specified once per net.
// //
// By default, all states will have solver = true; // By default, train_state will have phase = TRAIN,
// train_state will have phase = TRAIN,
// and all test_state's will have phase = TEST. // and all test_state's will have phase = TEST.
// Other defaults are set according to the NetState defaults. // Other defaults are set according to the NetState defaults.
optional NetState train_state = 26; optional NetState train_state = 26;
...@@ -187,7 +186,11 @@ message SolverParameter {
optional float clip_gradients = 35 [default = -1];
optional int32 snapshot = 14 [default = 0]; // The snapshot interval
// The prefix for the snapshot.
// If not set, it is replaced by the prototxt file path without extension.
// If set to a directory, it is augmented by the prototxt file name
// without extension.
optional string snapshot_prefix = 15;
// whether to snapshot diff in the results or not. Snapshotting diff will help
// debugging but the final protocol buffer size will be much larger.
optional bool snapshot_diff = 16 [default = false];
...@@ -219,7 +222,7 @@ message SolverParameter {
// RMSProp decay value
// MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
optional float rms_decay = 38 [default = 0.99];
// If true, print information about the state of the net that may help with
// debugging learning problems.
...@@ -239,6 +242,19 @@ message SolverParameter {
}
// DEPRECATED: use type instead of solver_type
optional SolverType solver_type = 30 [default = SGD];
// Overlap compute and communication for data parallel training
optional bool layer_wise_reduce = 41 [default = true];
// Path to caffemodel file(s) with pretrained weights to initialize finetuning.
// The same as the command line --weights parameter for the caffe train command.
// If the command line --weights parameter is specified, it has higher priority
// and overwrites these.
// If the --snapshot command line parameter is specified, these weights are ignored.
// If several model files are expected, they can be listed in a single
// weights parameter separated by ',' (like in a command string) or
// in repeated weights parameters separately.
repeated string weights = 42;
}
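As a minimal sketch of filling the new repeated field from Python via the generated caffe_pb2 module (file names are hypothetical):

from mace.proto import caffe_pb2

solver = caffe_pb2.SolverParameter()
# Either one comma-separated entry (as on the caffe train command line) ...
solver.weights.append("stage1.caffemodel,stage2.caffemodel")
# ... or separate repeated entries.
del solver.weights[:]
solver.weights.extend(["stage1.caffemodel", "stage2.caffemodel"])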
// A message that stores the solver snapshots // A message that stores the solver snapshots
...@@ -389,16 +405,12 @@ message LayerParameter {
optional PoolingParameter pooling_param = 121;
optional PowerParameter power_param = 122;
optional PReLUParameter prelu_param = 131;
optional PSROIPoolingParameter psroi_pooling_param = 149;
optional PSROIAlignParameter psroi_align_param = 1490;
optional PythonParameter python_param = 130;
optional RecurrentParameter recurrent_param = 146;
optional ReductionParameter reduction_param = 136;
optional ReLUParameter relu_param = 123;
optional ReshapeParameter reshape_param = 133;
optional ROIPoolingParameter roi_pooling_param = 8266711;
optional ScaleParameter scale_param = 142;
optional ProposalParameter proposal_param = 8266713;
optional SigmoidParameter sigmoid_param = 124;
optional SoftmaxParameter softmax_param = 125;
optional SPPParameter spp_param = 132;
...@@ -407,8 +419,6 @@ message LayerParameter {
optional ThresholdParameter threshold_param = 128;
optional TileParameter tile_param = 138;
optional WindowDataParameter window_data_param = 129;
optional NNPACKConvolutionParameter nnpack_convolution_param = 204;
}
// Message that stores parameters used to apply transformation
...@@ -424,7 +434,7 @@ message TransformationParameter {
optional uint32 crop_size = 3 [default = 0];
// mean_file and mean_value cannot be specified at the same time
optional string mean_file = 4;
// if specified can be repeated once (would subtract it from all the channels)
// or can be repeated the same number of times as channels
// (would subtract them from the corresponding channel)
repeated float mean_value = 5;
...@@ -440,7 +450,7 @@ message LossParameter {
optional int32 ignore_label = 1;
// How to normalize the loss for loss layers that aggregate across batches,
// spatial dimensions, or other dimensions. Currently only implemented in
// SoftmaxWithLoss and SigmoidCrossEntropyLoss layers.
enum NormalizationMode {
// Divide by the number of examples in the batch times spatial dimensions.
// Outputs that receive the ignore label will NOT be ignored in computing
...@@ -454,6 +464,8 @@ message LossParameter {
// Do not normalize the loss.
NONE = 3;
}
// For historical reasons, the default normalization for
// SigmoidCrossEntropyLoss is BATCH_SIZE and *not* VALID.
optional NormalizationMode normalization = 3 [default = VALID];
// Deprecated. Ignored if normalization is specified. If normalization
// is not specified, then setting this to false will be equivalent to
...@@ -504,11 +516,21 @@ message ConcatParameter {
}
message BatchNormParameter {
// If false, normalization is performed over the current mini-batch
// and global statistics are accumulated (but not yet used) by a moving
// average.
// If true, those accumulated mean and variance values are used for the
// normalization.
// By default, it is set to false when the network is in the training
// phase and true when the network is in the testing phase.
optional bool use_global_stats = 1;
// What fraction of the moving average remains each iteration?
// Smaller values make the moving average decay faster, giving more
// weight to the recent values.
// Each iteration updates the moving average @f$S_{t-1}@f$ with the
// current mean @f$ Y_t @f$ by
// @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$
// is the moving_average_fraction parameter.
optional float moving_average_fraction = 2 [default = .999];
// Small value to add to the variance estimate so that we don't divide by
// zero.
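A small numeric sketch of the update rule in the comment above, assuming moving_average_fraction (beta) = 0.999:

beta = 0.999  # moving_average_fraction
S = 0.0  # running statistic S_{t-1}
for Y in [0.5, 0.52, 0.48]:  # per-iteration batch means Y_t
    S = (1 - beta) * Y + beta * S  # S_t = (1-beta)*Y_t + beta*S_{t-1}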
...@@ -590,7 +612,6 @@ message ConvolutionParameter {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
NNPACK = 3;
}
optional Engine engine = 15 [default = DEFAULT];
...@@ -660,8 +681,8 @@ message DataParameter {
optional bool mirror = 6 [default = false];
// Force the encoded image to have 3 color channels
optional bool force_encoded_color = 9 [default = false];
// Prefetch queue (Increase if data feeding bandwidth varies, within the
// limit of device memory for GPU training)
optional uint32 prefetch = 10 [default = 4];
}
...@@ -808,6 +829,7 @@ message ImageDataParameter {
message InfogainLossParameter {
// Specify the infogain matrix source.
optional string source = 1;
optional int32 axis = 2 [default = 1]; // axis of prob
}
message InnerProductParameter {
...@@ -825,13 +847,6 @@ message InnerProductParameter {
// of the weight matrix. The weight matrix itself is not going to be transposed
// but rather the transfer flag of operations will be toggled accordingly.
optional bool transpose = 6 [default = false];
enum Engine {
DEFAULT = 0;
CAFFE = 1;
NNPACK = 2;
}
optional Engine engine = 7 [default = DEFAULT];
}
message InputParameter {
...@@ -915,7 +930,6 @@ message PoolingParameter {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
NNPACK = 3;
}
optional Engine engine = 11 [default = DEFAULT];
// If global_pooling then it will pool over the size of the bottom by doing
...@@ -930,17 +944,6 @@ message PowerParameter {
optional float shift = 3 [default = 0.0];
}
message PSROIPoolingParameter {
required float spatial_scale = 1;
required int32 output_dim = 2; // output channel number
required int32 group_size = 3; // number of groups to encode position-sensitive score maps
}
message PSROIAlignParameter {
required float spatial_scale = 1;
required int32 output_dim = 2; // output channel number
required int32 group_size = 3; // number of groups to encode position-sensitive score maps
}
message PythonParameter {
optional string module = 1;
optional string layer = 2;
...@@ -949,9 +952,7 @@ message PythonParameter {
// string, dictionary in Python dict format, JSON, etc. You may parse this
// string in `setup` method and use it in `forward` and `backward`.
optional string param_str = 3 [default = ''];
// DEPRECATED
optional bool share_in_parallel = 4 [default = false];
}
...@@ -1083,17 +1084,6 @@ message ReshapeParameter {
optional int32 num_axes = 3 [default = -1];
}
// Message that stores parameters used by ROIPoolingLayer
message ROIPoolingParameter {
// Pad, kernel size, and stride are all given as a single value for equal
// dimensions in height and width or as Y, X pairs.
optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
// Multiplicative spatial scale factor to translate ROI coords from their
// input scale to the scale used when pooling
optional float spatial_scale = 3 [default = 1];
}
message ScaleParameter {
// The first axis of bottom[0] (the first input Blob) along which to apply
// bottom[1] (the second input Blob). May be negative to index from the end
...@@ -1131,13 +1121,6 @@ message ScaleParameter {
optional FillerParameter bias_filler = 5;
}
// Message that stores parameters used by ProposalLayer
message ProposalParameter {
optional uint32 feat_stride = 1 [default = 16];
repeated uint32 scales = 2;
repeated float ratios = 3;
}
message SigmoidParameter {
enum Engine {
DEFAULT = 0;
...@@ -1438,22 +1421,6 @@ message PReLUParameter {
// Initial value of a_i. Default is a_i=0.25 for all i.
optional FillerParameter filler = 1;
// Whether or not slope parameters are shared across channels.
optional bool channel_shared = 2 [default = false];
}
message NNPACKConvolutionParameter {
enum Algorithm {
AUTO = 0;
WINOGRAD = 1;
FFT_16x16 = 2;
FFT_8x8 = 3;
}
optional Algorithm algorithm = 1 [default=AUTO];
enum KernelTransformStrategy {
RECOMPUTE = 0;
REUSE = 1;
}
optional KernelTransformStrategy kernel_transform_strategy = 2 [default=RECOMPUTE];
}
syntax = "proto2";
package mace;
enum NetMode {
INIT = 0;
NORMAL = 1;
}
enum DeviceType {
CPU = 0; // By default, CPU is used.
NEON = 1;
OPENCL = 2;
}
enum DataType {
DT_INVALID = 0;
// Data types that all computation devices are expected to be
// capable of supporting.
DT_FLOAT = 1;
DT_DOUBLE = 2;
DT_INT32 = 3;
DT_UINT8 = 4;
DT_INT16 = 5;
DT_INT8 = 6;
DT_STRING = 7;
DT_INT64 = 8;
DT_UINT16 = 9;
DT_BOOL = 10;
DT_HALF = 19;
DT_UINT32 = 22;
}
message TensorProto {
// The dimensions in the tensor.
repeated int64 dims = 1;
optional DataType data_type = 2 [default = DT_FLOAT];
// For float
repeated float float_data = 3 [packed = true];
// For int32, uint8, int8, uint16, int16, bool, and float16
// Note about float16: in storage we will basically convert float16 byte-wise
// to unsigned short and then store them in the int32_data field.
repeated int32 int32_data = 4 [packed = true];
// For bytes
optional bytes byte_data = 5;
// For strings
repeated bytes string_data = 6;
// For double
repeated double double_data = 9 [packed = true];
// For int64
repeated int64 int64_data = 10 [packed = true];
// Optionally, a name for the tensor.
optional string name = 7;
optional uint32 node_id = 100;
}
message Argument {
optional string name = 1;
optional float f = 2;
optional int64 i = 3;
optional bytes s = 4;
repeated float floats = 5;
repeated int64 ints = 6;
repeated bytes strings = 7;
}
// for hexagon mace-nnlib
message NodeInput {
optional int32 node_id = 1;
optional int32 output_port = 2;
}
message OutputShape {
repeated int64 dims = 1;
}
message OperatorDef {
repeated string input = 1;
repeated string output = 2;
optional string name = 3;
optional string type = 4;
repeated Argument arg = 5;
repeated OutputShape output_shape = 6;
repeated DataType output_type = 7;
repeated int32 mem_id = 10;
// for hexagon mace-nnlib
optional uint32 node_id = 100;
optional uint32 op_id = 101;
optional uint32 padding = 102;
repeated NodeInput node_input = 103;
repeated int32 out_max_byte_size = 104; // only support 32-bit len
}
// for memory optimization
message MemoryBlock {
optional int32 mem_id = 1;
optional uint32 x = 2;
optional uint32 y = 3;
}
message MemoryArena {
repeated MemoryBlock mem_block = 1;
}
// for hexagon mace-nnlib
message InputInfo {
optional string name = 1;
optional int32 node_id = 2;
repeated int32 dims = 3;
optional int32 max_byte_size = 4; // only support 32-bit len
optional DataType data_type = 5 [default = DT_FLOAT];
}
message OutputInfo {
optional string name = 1;
optional int32 node_id = 2;
repeated int32 dims = 3;
optional int32 max_byte_size = 4; // only support 32-bit len
optional DataType data_type = 5 [default = DT_FLOAT];
}
message NetDef {
optional string name = 1;
repeated OperatorDef op = 2;
optional string version = 3;
repeated Argument arg = 4;
repeated TensorProto tensors = 5;
// for mem optimization
optional MemoryArena mem_arena = 10;
// for hexagon mace-nnlib
repeated InputInfo input_info = 100;
repeated OutputInfo output_info = 101;
}
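A minimal sketch of building a TensorProto from Python (with the mace_py proto rule above); it shows the byte-wise float16-to-unsigned-short packing into int32_data described in the TensorProto comment. The tensor name and dims are hypothetical:

import numpy as np
from mace.proto import mace_pb2

t = mace_pb2.TensorProto()
t.name = "conv1_weights:0"  # hypothetical name
t.data_type = mace_pb2.DT_HALF
t.dims.extend([16, 3, 3, 3])
halves = np.zeros(16 * 3 * 3 * 3, dtype=np.float16)
# Reinterpret each float16 as an unsigned short before widening for int32_data.
t.int32_data.extend(halves.view(np.uint16).tolist())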
py_library(
name = "tf_converter_lib",
srcs = [
"convert_util.py",
"graph_util.py",
"tf_converter_lib.py",
"tf_dsp_converter_lib.py",
],
srcs_version = "PY2AND3",
deps = [
":memory_optimizer",
"//mace/proto:mace_py",
],
)
py_library(
name = "caffe_converter_lib",
srcs = [
"caffe_converter_lib.py",
],
srcs_version = "PY2AND3",
deps = [
":memory_optimizer",
"//mace/proto:caffe_py",
],
)
py_library(
name = "source_converter_lib",
srcs = [
"source_converter_lib.py",
],
srcs_version = "PY2AND3",
deps = [
"//mace/proto:mace_py",
],
)
py_binary(
name = "converter",
srcs = ["converter.py"],
srcs_version = "PY2AND3",
deps = [
":tf_converter_lib",
":caffe_converter_lib",
":source_converter_lib",
"@six_archive//:six",
],
)
py_binary(
name = "memory_optimizer",
srcs = ["memory_optimizer.py"],
srcs_version = "PY2AND3",
deps = [
"//mace/proto:mace_py",
],
)
import argparse
import os
import sys
import struct
import jinja2
import numpy as np
# python mace/python/tools/binary_codegen.py \
# --binary_dirs=${BIN_FILE} \
# --binary_file_name=mace_run.config \
# --output_path=${CODE_GEN_PATH} --variable_name=kTuningParamsData
FLAGS = None
def generate_cpp_source():
data_map = {}
for binary_dir in FLAGS.binary_dirs.split(","):
binary_path = os.path.join(binary_dir, FLAGS.binary_file_name)
if not os.path.exists(binary_path):
continue
with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8)
idx = 0
size, = struct.unpack("Q", binary_array[idx:idx+8])
print size
idx += 8
for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx+4])
idx += 4
key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size])
idx += key_size
params_size, = struct.unpack("i", binary_array[idx:idx+4])
idx += 4
data_map[key] = []
count = params_size / 4
params = struct.unpack(str(count) + "i", binary_array[idx:idx+params_size])
for i in params:
data_map[key].append(i)
idx += params_size
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('str2vec_maps.cc.tmpl').render(
maps = data_map,
data_type = 'unsigned int',
variable_name = FLAGS.variable_name
)
def main(unused_args):
cpp_binary_source = generate_cpp_source()
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_binary_source)
w_file.close()
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--binary_dirs",
type=str,
default="cl_bin0/,cl_bin1/",
help="The binaries file path.")
parser.add_argument(
"--binary_file_name",
type=str,
default="mace_run.config",
help="The binary file name.")
parser.add_argument(
"--output_path",
type=str,
default="",
help="The path of generated C++ source file which contains the binary.")
parser.add_argument(
"--variable_name",
type=str,
default="kTuningParamsData",
help="global variable name.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
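For reference, a sketch of the inverse operation under the same layout assumptions parsed above (a uint64 entry count, then per entry an int32 key length, the key bytes, an int32 payload byte size, and the packed int32 params); the key and file name are hypothetical:

import struct

def pack_tuning_params(data_map, path):
    with open(path, "wb") as f:
        f.write(struct.pack("Q", len(data_map)))
        for key, params in data_map.items():
            f.write(struct.pack("i", len(key)))
            f.write(key)
            f.write(struct.pack("i", 4 * len(params)))
            f.write(struct.pack(str(len(params)) + "i", *params))

pack_tuning_params({"conv_2d_k3x3": [4, 8, 1]}, "mace_run.config")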
from mace.proto import caffe_pb2
import google.protobuf.text_format
import operator
import functools
import argparse
import sys
import six
import os.path
FLAGS = None
def main(unused_args):
if not os.path.isfile(FLAGS.input):
print 'input model file does not exist'
return -1
net = caffe_pb2.NetParameter()
with open(FLAGS.input) as f:
google.protobuf.text_format.Merge(str(f.read()), net)
ops = {}
for layer in net.layer:
if layer.type not in ops:
ops[layer.type] = 1
else:
ops[layer.type] += 1
for key, value in sorted(ops.items(), key=operator.itemgetter(1)):
print key, ":", value
def parse_args():
'''Parses command line arguments.'''
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
type=str,
default='',
help='Caffe \'GraphDef\' file to load.')
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
import tensorflow as tf
from mace.proto import mace_pb2
TF_DTYPE_2_MACE_DTYPE_MAP = {
tf.float32: mace_pb2.DT_FLOAT,
tf.double: mace_pb2.DT_DOUBLE,
tf.half: mace_pb2.DT_HALF,
tf.int64: mace_pb2.DT_INT64,
tf.int32: mace_pb2.DT_INT32,
tf.qint32: mace_pb2.DT_INT32,
tf.int16: mace_pb2.DT_INT16,
tf.qint16: mace_pb2.DT_INT16,
tf.int8: mace_pb2.DT_INT8,
tf.qint8: mace_pb2.DT_INT8,
tf.quint16: mace_pb2.DT_UINT16,
tf.uint16: mace_pb2.DT_UINT16,
tf.quint8: mace_pb2.DT_UINT8,
tf.uint8: mace_pb2.DT_UINT8,
tf.string: mace_pb2.DT_STRING,
tf.bool: mace_pb2.DT_BOOL,
}
def tf_dtype_2_mace_dtype(tf_dtype):
mace_dtype = TF_DTYPE_2_MACE_DTYPE_MAP.get(tf_dtype, None)
if not mace_dtype:
raise Exception("Not supported tensorflow dtype: " + tf_dtype)
return mace_dtype
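A minimal usage sketch of the helper above:

import tensorflow as tf
from mace.proto import mace_pb2
from mace.python.tools.convert_util import tf_dtype_2_mace_dtype

assert tf_dtype_2_mace_dtype(tf.float32) == mace_pb2.DT_FLOAT
assert tf_dtype_2_mace_dtype(tf.quint8) == mace_pb2.DT_UINT8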
import argparse
import sys
import hashlib
import os.path
from mace.python.tools import source_converter_lib
# ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb --output quantized_test_dsp.pb --runtime dsp --input_dim input_node,1,28,28,3
FLAGS = None
def file_checksum(fname):
hash_func = hashlib.sha256()
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_func.update(chunk)
return hash_func.hexdigest()
def main(unused_args):
if not os.path.isfile(FLAGS.model_file):
print("Input graph file '" + FLAGS.model_file + "' does not exist!")
sys.exit(-1)
model_checksum = file_checksum(FLAGS.model_file)
if FLAGS.model_checksum != "" and FLAGS.model_checksum != model_checksum:
print("Model checksum mismatch: %s != %s" % (model_checksum, FLAGS.model_checksum))
sys.exit(-1)
if FLAGS.platform == 'caffe':
if not os.path.isfile(FLAGS.weight_file):
print("Input weight file '" + FLAGS.weight_file + "' does not exist!")
sys.exit(-1)
weight_checksum = file_checksum(FLAGS.weight_file)
if FLAGS.weight_checksum != "" and FLAGS.weight_checksum != weight_checksum:
print("Weight checksum mismatch: %s != %s" % (weight_checksum, FLAGS.weight_checksum))
sys.exit(-1)
if FLAGS.runtime == 'dsp':
print("DSP not support caffe model yet.")
sys.exit(-1)
from mace.python.tools import caffe_converter_lib
output_graph_def = caffe_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node,
FLAGS.data_type, FLAGS.runtime, FLAGS.winograd)
elif FLAGS.platform == 'tensorflow':
if FLAGS.runtime == 'dsp':
from mace.python.tools import tf_dsp_converter_lib
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.input_node, FLAGS.output_node, FLAGS.dsp_mode)
else:
from mace.python.tools import tf_converter_lib
output_graph_def = tf_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node,
FLAGS.data_type, FLAGS.runtime, FLAGS.winograd)
if FLAGS.output_type == 'source':
source_converter_lib.convert_to_source(output_graph_def, model_checksum, FLAGS.template, FLAGS.obfuscate,
FLAGS.model_tag, FLAGS.output, FLAGS.runtime, FLAGS.embed_model_data)
else:
with open(FLAGS.output, "wb") as f:
f.write(output_graph_def.SerializeToString())
with open(FLAGS.output + '_txt', "wb") as f:
# output_graph_def.ClearField('tensors')
f.write(str(output_graph_def))
print("Model conversion is completed.")
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--model_file",
type=str,
default="",
help="TensorFlow \'GraphDef\' file to load, Caffe prototxt file to load.")
parser.add_argument(
"--weight_file",
type=str,
default="",
help="Caffe data file to load.")
parser.add_argument(
"--model_checksum",
type=str,
default="",
help="Model file sha256 checksum")
parser.add_argument(
"--weight_checksum",
type=str,
default="",
help="Weight file sha256 checksum")
parser.add_argument(
"--output",
type=str,
default="",
help="File to save the output graph to.")
parser.add_argument(
"--runtime",
type=str,
default="cpu",
help="Runtime: cpu/gpu/dsp")
parser.add_argument(
"--input_node",
type=str,
default="input_node",
help="e.g., input_node")
parser.add_argument(
"--output_node",
type=str,
default="softmax",
help="e.g., softmax")
parser.add_argument(
"--data_type",
type=str,
default='DT_FLOAT',
help="e.g., DT_HALF/DT_FLOAT")
parser.add_argument(
"--output_type",
type=str,
default="pb",
help="output type: source/pb")
parser.add_argument(
"--template",
type=str,
default="",
help="template path")
parser.add_argument(
"--obfuscate",
type=str2bool,
nargs='?',
const=False,
default=False,
help="obfuscate model names")
parser.add_argument(
"--model_tag",
type=str,
default="",
help="model tag for generated function and namespace")
parser.add_argument(
"--winograd",
type=str2bool,
nargs='?',
const=False,
default=False,
help="open winograd convolution or not")
parser.add_argument(
"--dsp_mode",
type=int,
default=0,
help="dsp run mode, defalut=0")
parser.add_argument(
"--input_shape",
type=str,
default="",
help="input shape.")
parser.add_argument(
"--platform",
type=str,
default="tensorflow",
help="tensorflow/caffe")
parser.add_argument(
"--embed_model_data",
type=str2bool,
default=True,
help="input shape.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
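A hypothetical invocation, following the flags defined above (values are illustrative):
# python mace/python/tools/converter.py --platform=tensorflow \
#     --model_file=model.pb --output=mace_model.pb --runtime=gpu \
#     --input_node=input_node --output_node=softmax --data_type=DT_HALF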
class DspOps(object):
def __init__(self):
self.dsp_ops = {
'INPUT': 'INPUT"',
'OUTPUT': 'OUTPUT',
'NoOp': 'Nop',
'FLATTEN': 'Flatten',
'Identity': 'Nop',
'Placeholder': 'INPUT',
'Const': 'Const',
'QuantizedConv2D': 'QuantizedConv2d_8x8to32',
'QuantizedMatMul': 'QuantizedMatMul_8x8to32',
'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8',
'QuantizedRelu': 'QuantizedRelu_8',
'QuantizedReluX': 'QuantizedReluX_8',
'QuantizedMaxPool': 'QuantizedMaxPool_8',
'QuantizedAvgPool': 'QuantizedAvgPool_8',
'QuantizedConcat': 'QuantizedConcat_8',
'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32',
'QuantizedResizeBilinear' : 'QuantizedResizeBilinear_8',
'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8',
'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8',
'QuantizedSoftmax': 'QuantizedSoftmax_8',
'Min': 'Min_f',
'Max': 'Max_f',
'QuantizeV2': 'Quantize',
'Dequantize': 'Dequantize',
'Softmax': 'Softmax_f',
'Reshape': 'Reshape',
'QuantizedReshape': 'QuantizedReshape',
'Sigmoid': 'Sigmoid_f',
'Slice': 'Slice_f',
'Add': 'Add_f',
'Mul': 'Mul_f',
'Requantize': 'Requantize_32to8',
'RequantizationRange': 'RequantizationRange_32',
'Sub': 'Sub_f',
'Pack': 'Pack_int32',
'StridedSlice': 'StridedSlice_f',
'ExpandDims': 'ExpandDims_f',
'QuantizedMul': 'QuantizedMul_8x8to32',
'QuantizedAdd': 'QuantizedAdd_8p8to32',
'Pad': 'Pad_f',
'SpaceToBatchND': 'SpaceToBatchND_f',
'BatchToSpaceND': 'BatchToSpaceND_f',
'ResizeBilinear': 'ResizeBilinear_f',
'ConcatV2': 'ConcatV2_f',
'Conv2DBackpropInput': 'Deconv_f',
'Tanh': 'Tanh_f',
'Split': 'Split_f',
'Transpose': 'Transpose_f',
'Concat': 'Concat_f',
'AddN': 'AddN_f',
}
def has_op(self, tf_op):
return tf_op in self.dsp_ops
def map_nn_op(self, tf_op):
if tf_op not in self.dsp_ops:
raise Exception('Could not map nn op for: ', tf_op)
return self.dsp_ops[tf_op]
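A short usage sketch of the mapping above:

dsp_ops = DspOps()
if dsp_ops.has_op('QuantizedConv2D'):
    print dsp_ops.map_nn_op('QuantizedConv2D')  # prints QuantizedConv2d_8x8to32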
import tensorflow as tf
from mace.proto import mace_pb2
from collections import OrderedDict
def sort_tf_node(node, nodes_map, ordered_nodes_map):
if node.name not in ordered_nodes_map:
for input_tensor_name in node.input:
input_node_name = input_tensor_name.split(':')[
0] if ':' in input_tensor_name else input_tensor_name
if input_node_name not in nodes_map or input_node_name in ordered_nodes_map:
continue
input_node = nodes_map[input_node_name]
sort_tf_node(input_node, nodes_map, ordered_nodes_map)
ordered_nodes_map[node.name] = node
def sort_tf_graph(graph_def):
nodes_map = {}
ordered_nodes_map = OrderedDict()
for node in graph_def.node:
nodes_map[node.name] = node
for node in graph_def.node:
sort_tf_node(node, nodes_map, ordered_nodes_map)
sorted_graph = tf.GraphDef()
sorted_graph.node.extend([node for node in ordered_nodes_map.values()])
return sorted_graph
def sort_mace_node(node, nodes_map, ordered_nodes_map):
if node.name not in ordered_nodes_map:
for input_tensor_name in node.input:
input_node_name = input_tensor_name.split(':')[
0] if ':' in input_tensor_name else input_tensor_name
if input_node_name not in nodes_map or input_node_name in ordered_nodes_map:
continue
input_node = nodes_map[input_node_name]
sort_mace_node(input_node, nodes_map, ordered_nodes_map)
ordered_nodes_map[node.name] = node
def sort_mace_graph(graph_def, output_name):
nodes_map = {}
ordered_nodes_map = OrderedDict()
for node in graph_def.op:
nodes_map[node.name] = node
sort_mace_node(nodes_map[output_name], nodes_map, ordered_nodes_map)
sorted_graph = mace_pb2.NetDef()
sorted_graph.tensors.extend(graph_def.tensors)
sorted_graph.op.extend([node for node in ordered_nodes_map.values()])
return sorted_graph
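A sketch of topologically sorting a frozen TensorFlow graph with the helpers above ('model.pb' is a hypothetical frozen GraphDef):

import tensorflow as tf
from mace.python.tools.graph_util import sort_tf_graph

graph_def = tf.GraphDef()
with open('model.pb', 'rb') as f:
    graph_def.ParseFromString(f.read())
sorted_graph = sort_tf_graph(graph_def)  # nodes now appear after their inputs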
import sys
import operator
from mace.proto import mace_pb2
class MemoryOptimizer(object):
def __init__(self, net_def):
self.net_def = net_def
self.idle_mem = set()
self.op_mem = {} # op_name->mem_id
self.mem_block = {} # mem_id->[x, y]
self.total_mem_count = 0
self.ref_counter = {}
consumers = {}
for op in net_def.op:
if self.is_buffer_image_op(op):
continue
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
# only ref op's output tensor
for op in net_def.op:
if self.is_buffer_image_op(op):
continue
for output in op.output:
tensor_name = output
if tensor_name in consumers:
self.ref_counter[tensor_name] = len(consumers[tensor_name])
else:
self.ref_counter[tensor_name] = 0
def is_buffer_image_op(self, op):
return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'
def get_mem_size(self, op_type, output_shape):
mem_size = [0, 0]
if op_type == 'WinogradTransform' or op_type == 'GEMM':
mem_size[0] = output_shape[2] * output_shape[3]
mem_size[1] = output_shape[0] * int((output_shape[1]+3)/4)
else:
mem_size[0] = output_shape[2] * int((output_shape[3]+3)/4)
mem_size[1] = output_shape[0] * output_shape[1]
return mem_size
def optimize(self):
for op in self.net_def.op:
if self.is_buffer_image_op(op):
continue
if not op.output_shape:
print('WARNING: There is no output shape information to do memory optimization.')
return
if len(op.output_shape) != len(op.output):
print('WARNING: the number of output shapes is not equal to the number of outputs.')
return
for i in range(len(op.output)):
if len(self.idle_mem) == 0:
# allocate new mem
mem_id = self.total_mem_count
self.total_mem_count += 1
else:
# reuse mem
mem_id = self.idle_mem.pop()
op.mem_id.extend([mem_id])
self.op_mem[op.output[i]] = mem_id
if mem_id not in self.mem_block:
self.mem_block[mem_id] = [0, 0]
mem_size = self.mem_block[mem_id]
op_mem_size = self.get_mem_size(op.type, op.output_shape[i].dims)
mem_size[0] = max(mem_size[0], op_mem_size[0])
mem_size[1] = max(mem_size[1], op_mem_size[1])
# de-ref input tensor mem
for ipt in op.input:
if ipt in self.ref_counter:
self.ref_counter[ipt] -= 1
if self.ref_counter[ipt] == 0:
self.idle_mem.add(self.op_mem[ipt])
elif self.ref_counter[ipt] < 0:
raise Exception('ref count is less than 0')
for mem in self.mem_block:
arena = self.net_def.mem_arena
block = arena.mem_block.add()
block.mem_id = mem
block.x = self.mem_block[mem][0]
block.y = self.mem_block[mem][1]
print('total op: %d' % len(self.net_def.op))
origin_mem_size = 0
optimized_mem_size = 0
for op in self.net_def.op:
if self.is_buffer_image_op(op):
continue
origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
for mem in self.mem_block:
print mem, self.mem_block[mem]
optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
print('origin mem: %d, optimized mem: %d' % (origin_mem_size, optimized_mem_size))
def optimize_memory(net_def):
mem_optimizer = MemoryOptimizer(net_def)
mem_optimizer.optimize()
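A worked example of the image-block arithmetic in get_mem_size above: a regular op with NHWC output [1, 56, 56, 64] packs four channels per pixel, so the block is 56 * ceil(64/4) = 896 pixels wide and 1 * 56 = 56 pixels high:

shape = [1, 56, 56, 64]  # [N, H, W, C]
x = shape[2] * int((shape[3] + 3) / 4)  # 896
y = shape[0] * shape[1]  # 56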
//
// Copyright (c) 2017 XiaoMi All rights reserved.
// Generated by the mace converter. DO NOT EDIT!
//
#include <vector>
#include <string>
#include "mace/public/mace.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
namespace mace {
namespace {{tag}} {
{% for tensor in tensors %}
extern void CreateTensor{{ tensor.id }}(std::vector<mace::ConstTensor> &tensors,
const unsigned char *model_data);
{% endfor %}
{% for i in range(net.op|length) %}
extern void CreateOperator{{i}}(mace::OperatorDef &op);
{% endfor %}
} // namespace {{ tag }}
namespace {
{% if net.arg|length != 0 %}
void CreateNetArg(mace::NetDef &net_def) {
net_def.mutable_arg().reserve({{ net.arg|length }});
mace::Argument *arg = nullptr;
{% for arg in net.arg %}
arg = net_def.add_arg();
arg->set_name({{ arg.name|tojson }});
{%- if arg.HasField('f') %}
arg->set_f({{ arg.f }});
{% endif %}
{%- if arg.HasField('i') %}
arg->set_i({{ arg.i }});
{% endif %}
{%- if arg.HasField('s') %}
arg->set_s({{ arg.s|tojson }});
{% endif %}
{% if arg.floats|length != 0 %}
arg->set_floats({ {{ arg.floats|join(', ') }} });
{% endif %}
{% if arg.ints|length != 0 %}
arg->set_ints({ {{ arg.ints|join(', ') }} });
{% endif %}
{% if arg.strings|length != 0 %}
arg->set_strings({ {{ arg.strings|stringfy() }} });
{% endif %}
{% endfor %}
}
{% endif %}
{% if net.output_info | length > 0 %}
void CreateOutputInfo(mace::NetDef &net_def) {
std::vector<std::vector<int>> dims { {{net.output_info | map(attribute='dims') | join(', ') | replace('[', '{') | replace(']', '}') }} };
std::vector<int> data_types_int { {{ net.output_info | map(attribute='data_type') | join(', ') }} };
std::vector<mace::DataType> data_types({{ net.output_info | length }});
for (int k = 0; k < {{ net.output_info | length }}; ++k) {
data_types[k] = static_cast<mace::DataType>(data_types_int[k]);
}
net_def.mutable_output_info().resize({{ net.output_info | length }});
for (int i = 0; i < {{ net.output_info | length }}; ++i) {
net_def.mutable_output_info()[i].set_data_type(data_types[i]);
net_def.mutable_output_info()[i].set_dims(dims[i]);
}
}
{% endif %}
void CreateOperators(std::vector<mace::OperatorDef> &ops) {
MACE_LATENCY_LOGGER(1, "Create operators");
ops.resize({{ net.op|length }});
{% for i in range(net.op|length) %}
mace::{{tag}}::CreateOperator{{i}}(ops[{{i}}]);
{% endfor %}
}
void CreateTensors(std::vector<mace::ConstTensor> &tensors,
const unsigned char *model_data) {
MACE_LATENCY_LOGGER(1, "Create tensors");
tensors.reserve({{ net.tensors|length }});
{% for tensor in tensors %}
mace::{{tag}}::CreateTensor{{tensor.id}}(tensors, model_data);
{% endfor %}
}
{% if net.mem_arena.mem_block|length != 0 %}
void CreateMemoryArena(mace::MemoryArena &mem_arena) {
std::vector<mace::MemoryBlock> &mem_block = mem_arena.mutable_mem_block();
mem_block.reserve({{ net.mem_arena.mem_block|length }});
{% for mem_blk in net.mem_arena.mem_block %}
mem_block.emplace_back(mace::MemoryBlock({{ mem_blk.mem_id }},
{{mem_blk.x}},
{{mem_blk.y}}));
{% endfor %}
}
{% endif %}
} // namespace
namespace {{tag}} {
NetDef CreateNet(const unsigned char *model_data) {
MACE_LATENCY_LOGGER(1, "Create net {{ net.name }}");
NetDef net_def;
net_def.set_name("{{ net.name}}");
net_def.set_version("{{ net.version }}");
{% if net.arg|length != 0 %}
CreateNetArg(net_def);
{% endif %}
CreateOperators(net_def.mutable_op());
CreateTensors(net_def.mutable_tensors(), model_data);
{% if net.mem_arena.mem_block|length != 0 %}
CreateMemoryArena(net_def.mutable_mem_arena());
{% endif %}
{% if net.output_info | length > 0 %}
CreateOutputInfo(net_def);
{% endif %}
return net_def;
}
const std::string ModelChecksum() {
return {{ model_pb_checksum|tojson }};
}
} // namespace {{tag}}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
// Generated by the mace converter. DO NOT EDIT!
//
#include <string>
#include "mace/public/mace.h"
namespace mace {
namespace {{tag}} {
extern const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelChecksum();
} // namespace {{ tag }}
} // namespace mace
import argparse
import os
import sys
import numpy as np
import jinja2
# python mace/python/tools/opencl_codegen.py \
# --cl_binary_dirs=${CL_BIN_DIR} --output_path=${CL_HEADER_PATH}
FLAGS = None
def generate_cpp_source():
maps = {}
cl_binary_dir_arr = FLAGS.cl_binary_dirs.split(",")
for cl_binary_dir in cl_binary_dir_arr:
if not os.path.exists(cl_binary_dir):
print("Input cl_binary_dir " + cl_binary_dir + " doesn't exist!")
for file_name in os.listdir(cl_binary_dir):
file_path = os.path.join(cl_binary_dir, file_name)
if file_path[-4:] == ".bin":
# read binary
f = open(file_path, "rb")
binary_array = np.fromfile(f, dtype=np.uint8)
f.close()
maps[file_name[:-4]] = []
for ele in binary_array:
maps[file_name[:-4]].append(hex(ele))
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('str2vec_maps.cc.tmpl').render(
maps = maps,
data_type = 'unsigned char',
variable_name = 'kCompiledProgramMap'
)
def main(unused_args):
cpp_cl_binary_source = generate_cpp_source()
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_cl_binary_source)
w_file.close()
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--cl_binary_dirs",
type=str,
default="cl_bin0/,cl_bin1/,cl_bin2/",
help="The cl binaries directories.")
parser.add_argument(
"--output_path",
type=str,
default="./mace/examples/codegen/opencl/opencl_compiled_program.cc",
help="The path of generated C++ header file which contains cl binaries.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
// Generated by the mace converter. DO NOT EDIT!
//
#include <vector>
#include <string>
#include "mace/public/mace.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
namespace mace {
namespace {
void UpdateOp(mace::OperatorDef &op,
const std::string &name,
const std::string &type,
const std::vector<std::string> &inputs,
const std::vector<std::string> &outputs,
const std::vector<mace::DataType> &output_types,
uint32_t node_id,
const std::vector<int> &mem_ids) {
op.set_name(name);
op.set_type(type);
op.set_input(inputs);
op.set_output(outputs);
op.set_output_type(output_types);
op.set_node_id(node_id);
op.set_mem_id(mem_ids);
}
} // namespace
} // namespace mace
namespace mace {
namespace {{tag}} {
{% for i in range(start, end) %}
void CreateOperator{{i}}(mace::OperatorDef &op) {
MACE_LATENCY_LOGGER(2, "Create operator {{ net.op[i].name }}");
mace::Argument *arg = nullptr;
{% for arg in net.op[i].arg %}
arg = op.add_arg();
arg->set_name({{ arg.name|tojson }});
{%- if arg.HasField('f') %}
arg->set_f({{ arg.f }});
{%- endif %}
{%- if arg.HasField('i') %}
arg->set_i({{ arg.i }});
{%- endif %}
{%- if arg.HasField('s') %}
arg->set_s({{ arg.s|tojson }});
{%- endif %}
{% if arg.floats|length != 0 %}
arg->set_floats({ {{ arg.floats|join(', ') }} });
{% endif %}
{% if arg.ints|length != 0 %}
arg->set_ints({ {{ arg.ints|join(', ') }} });
{% endif %}
{% if arg.strings|length != 0 %}
arg->set_strings({ {{ arg.strings|stringfy() }} });
{% endif %}
{% endfor %}
{% for shape in net.op[i].output_shape %}
{% if shape.dims | length > 0 %}
op.add_output_shape(mace::OutputShape({ {{ shape.dims|join(', ') }} }));
{% endif %}
{% endfor %}
std::vector<int> output_types_int({ {{ net.op[i].output_type | join(', ') }} });
std::vector<mace::DataType> output_types({{ net.op[i].output_type | length }});
for (int k = 0; k < {{ net.op[i].output_type | length }}; ++k) {
output_types[k] = static_cast<mace::DataType>(output_types_int[k]);
}
UpdateOp(op, {{ net.op[i].name|tojson }}, {{ net.op[i].type|tojson}},
{ {{ net.op[i].input|stringfy }} },
{ {{ net.op[i].output|stringfy }} },
output_types,
{{ net.op[i].node_id }},
{ {{ net.op[i].mem_id | join(', ') }} });
{% if runtime == 'dsp' %}
op.set_padding({{ net.op[i].padding }});
{% if net.op[i].node_input | length > 0 %}
std::vector<int> input_node_ids({ {{ net.op[i].node_input | map(attribute='node_id') | join(', ') }} });
std::vector<int> input_output_ports({ {{ net.op[i].node_input | map(attribute='output_port') | join(', ')}} });
for (size_t i = 0; i < {{ net.op[i].node_input | length }}; ++i) {
mace::NodeInput input(input_node_ids[i], input_output_ports[i]);
op.add_node_input(input);
}
{% endif %}
{% if net.op[i].out_max_byte_size | length > 0 %}
std::vector<int> out_max_byte_sizes {{ net.op[i].out_max_byte_size | replace('[', '{') | replace(']', '}') }};
for (size_t i = 0; i < {{ net.op[i].out_max_byte_size | length }}; ++i) {
op.add_out_max_byte_size(out_max_byte_sizes[i]);
}
{% endif %}
{% endif %}
}
{% endfor %}
} // namespace {{tag}}
} // namespace mace
import os
import uuid
import numpy as np
import hashlib
from mace.proto import mace_pb2
from jinja2 import Environment, FileSystemLoader
GENERATED_NAME = set()
def generate_obfuscated_name(namespace, name):
md5 = hashlib.md5()
md5.update(namespace)
md5.update(name)
md5_digest = md5.hexdigest()
name = md5_digest[:8]
while name in GENERATED_NAME:
name = md5_digest
assert name not in GENERATED_NAME
GENERATED_NAME.add(name)
return name
def generate_tensor_map(tensors):
tensor_map = {}
for t in tensors:
if not tensor_map.has_key(t.name):
tensor_map[t.name] = generate_obfuscated_name("tensor", t.name)
return tensor_map
def generate_in_out_map(ops, tensor_map):
in_out_map = {}
for op in ops:
op.name = generate_obfuscated_name("op", op.name)
for input_name in op.input:
if not in_out_map.has_key(input_name):
if tensor_map.has_key(input_name):
in_out_map[input_name] = tensor_map[input_name]
else:
in_out_map[input_name] = generate_obfuscated_name("in", input_name)
for output_name in op.output:
if not in_out_map.has_key(output_name):
if tensor_map.has_key(output_name):
in_out_map[output_name] = tensor_map[output_name]
else:
in_out_map[output_name] = generate_obfuscated_name("out", output_name)
return in_out_map
def obfuscate_name(net_def):
input_node = "mace_input_node"
output_node = "mace_output_node"
tensor_map = generate_tensor_map(net_def.tensors)
in_out_map = generate_in_out_map(net_def.op, tensor_map)
for t in net_def.tensors:
if input_node not in t.name and output_node not in t.name:
t.name = tensor_map[t.name]
for op in net_def.op:
for i in range(len(op.input)):
if input_node not in op.input[i]:
op.input[i] = in_out_map[op.input[i]]
for i in range(len(op.output)):
if output_node not in op.output[i]:
op.output[i] = in_out_map[op.output[i]]
def rename_tensor(net_def):
tensor_map = {}
for t in net_def.tensors:
if not tensor_map.has_key(t.name):
tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_")
t.name = tensor_map[t.name]
for op in net_def.op:
for i in range(len(op.input)):
if tensor_map.has_key(op.input[i]):
op.input[i] = tensor_map[op.input[i]]
for i in range(len(op.output)):
if tensor_map.has_key(op.output[i]):
op.output[i] = tensor_map[op.output[i]]
class TensorInfo:
def __init__(self, id, t, runtime):
self.id = id
self.data_type = mace_pb2.DataType.Name(t.data_type)
if t.data_type == mace_pb2.DT_FLOAT:
if runtime == 'gpu':
self.data_type = mace_pb2.DT_HALF
self.data = bytearray(np.array(t.float_data).astype(np.float16).tobytes())
else:
self.data_type = mace_pb2.DT_FLOAT
self.data = bytearray(np.array(t.float_data).astype(np.float32).tobytes())
elif t.data_type == mace_pb2.DT_INT32:
self.data = bytearray(np.array(t.int32_data).astype(np.int32).tobytes())
elif t.data_type == mace_pb2.DT_UINT8:
self.data = bytearray(np.array(t.int32_data).astype(np.uint8).tolist())
def stringfy(value):
return ', '.join('"{0}"'.format(w) for w in value)
def convert_to_source(net_def, model_pb_checksum, template_dir, obfuscate, model_tag, output, runtime, embed_model_data):
if obfuscate:
obfuscate_name(net_def)
else:
rename_tensor(net_def)
print template_dir # debug: the template directory in use
# Create the jinja2 environment.
j2_env = Environment(loader=FileSystemLoader(template_dir), trim_blocks=True)
j2_env.filters['stringfy'] = stringfy
output_dir = os.path.dirname(output) + '/'
# generate tensor source files
template_name = 'tensor_source.template'
model_data = []
offset = 0
counter = 0
for t in net_def.tensors:
tensor_info = TensorInfo(counter, t, runtime)
# align
if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0:
padding = 4 - offset % 4
model_data.extend(bytearray([0] * padding))
offset += padding
source = j2_env.get_template(template_name).render(
tensor_info = tensor_info,
tensor = t,
tag = model_tag,
runtime = runtime,
offset = offset,
)
model_data.extend(tensor_info.data)
offset += len(tensor_info.data)
with open(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate tensor data
template_name = 'tensor_data.template'
source = j2_env.get_template(template_name).render(
tag = model_tag,
embed_model_data = embed_model_data,
model_data_size = offset,
model_data = model_data
)
with open(output_dir + 'tensor_data' + '.cc', "wb") as f:
f.write(source)
if not embed_model_data:
f = open(output_dir + model_tag + '.data', "wb")
f.write(bytearray(model_data))
f.close()
# generate op source files
template_name = 'operator.template'
counter = 0
op_size = len(net_def.op)
for start in range(0, op_size, 10):
source = j2_env.get_template(template_name).render(
start = start,
end = min(start+10, op_size),
net = net_def,
tag = model_tag,
runtime = runtime,
)
with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate model source files
template_name = 'model.template'
tensors = [TensorInfo(i, net_def.tensors[i], runtime) for i in range(len(net_def.tensors))]
source = j2_env.get_template(template_name).render(
tensors = tensors,
net = net_def,
tag = model_tag,
runtime = runtime,
model_pb_checksum = model_pb_checksum
)
with open(output, "wb") as f:
f.write(source)
# generate model header file
template_name = 'model_header.template'
source = j2_env.get_template(template_name).render(
tag = model_tag,
)
with open(output_dir + model_tag + '.h', "wb") as f:
f.write(source)
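A minimal sketch of the obfuscated-name scheme implemented above (an 8-hex-digit md5 prefix keyed by namespace and original name; the example name is hypothetical):

import hashlib

md5 = hashlib.md5()
md5.update("tensor")
md5.update("conv1/weights:0")
print md5.hexdigest()[:8]  # stable short alias for the tensor name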
//
// Copyright (c) 2017 XiaoMi All rights reserved.
// Generated by the mace converter. DO NOT EDIT!
//
#include <vector>
#include <string>
#include "mace/public/mace.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
{% if not embed_model_data %}
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
{% endif %}
namespace mace {
namespace {{tag}} {
{% if embed_model_data %}
alignas(4) const unsigned char model_data[{{ model_data_size }}] = {
{% for d in model_data %}{{"0x%02X, " % d }}{%endfor%}
};
{% endif %}
const unsigned char *LoadModelData(const char *model_data_file) {
{% if embed_model_data %}
return model_data;
{% else %}
int fd = open(model_data_file, O_RDONLY);
MACE_CHECK(fd >= 0, "Failed to open model data file ",
model_data_file, ", error code: ", errno);
const unsigned char *model_data =
static_cast<const unsigned char *>(mmap(nullptr, {{ model_data_size }},
PROT_READ, MAP_PRIVATE, fd, 0));
MACE_CHECK(model_data != MAP_FAILED, "Failed to map model data file ",
model_data_file, ", error code: ", errno);
int ret = close(fd);
MACE_CHECK(ret == 0, "Failed to close model data file ",
model_data_file, ", error code: ", errno);
return model_data;
{% endif %}
}
void UnloadModelData(const unsigned char *model_data) {
{% if not embed_model_data %}
int ret = munmap(const_cast<unsigned char *>(model_data),
{{ model_data_size }});
MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ", errno);
{% endif %}
}
} // namespace {{tag}}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
// Generated by the mace converter. DO NOT EDIT!
//
#include <vector>
#include <string>
#include "mace/public/mace.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
namespace mace {
namespace {{tag}} {
void CreateTensor{{tensor_info.id}}(std::vector<mace::ConstTensor> &tensors,
const unsigned char *model_data) {
MACE_LATENCY_LOGGER(2, "Create tensor {{ tensor.name }}");
tensors.emplace_back(mace::ConstTensor(
{{ tensor.name|tojson }}, model_data + {{ offset }},
{ {{ tensor.dims|join(', ') }} }, {{ tensor_info.data_type }}, {{ tensor.node_id }}));
}
} // namespace {{tag}}
} // namespace mace
from mace.proto import mace_pb2
import tensorflow as tf
from tensorflow import gfile
from operator import mul
from dsp_ops import DspOps
from mace.python.tools import graph_util
from mace.python.tools.convert_util import tf_dtype_2_mace_dtype
# converter --input ../libcv/quantized_model.pb --output quantized_model_dsp.pb \
# --runtime dsp --input_node input_node --output_node output_node
padding_mode = {
'NA': 0,
'SAME': 1,
'VALID': 2,
'MIRROR_REFLECT': 3,
'MIRROR_SYMMETRIC': 4,
'SAME_CAFFE': 5
}
def get_tensor_name_from_op(op_name, port):
return op_name + ':' + str(port)
def get_node_from_map(op_map, op_or_tensor_name):
op_name = op_or_tensor_name.split(':')[0]
return op_map[op_name]
def get_op_and_port_from_tensor(tensor_name):
op, port = tensor_name.split(':')
port = int(port)
return op, port
def max_elem_size(tensor):
if len(tensor.shape.as_list()) == 0:
return tensor.dtype.size
else:
return reduce(mul, tensor.shape.as_list()) * tensor.dtype.size
def find_dtype(tensor_dtype):
if tensor_dtype == tf.float32:
return mace_pb2.DT_FLOAT
elif tensor_dtype == tf.uint8 or tensor_dtype == tf.quint8:
return mace_pb2.DT_UINT8
elif tensor_dtype == tf.int32 or tensor_dtype == tf.qint32:
return mace_pb2.DT_INT32
else:
raise Exception('Unsupported data type: ', tensor_dtype)
def has_padding_and_strides(op):
return 'padding' in op.node_def.attr and 'strides' in op.node_def.attr
def is_node_flatten_reshape(op):
return op.type == 'Reshape' and len(op.outputs[0].shape) == 1
def get_input_tensor(op, index):
input_tensor = op.inputs[index]
if input_tensor.op.type == 'Reshape':
input_tensor = get_input_tensor(input_tensor.op, 0)
return input_tensor
def add_shape_const_node(net_def, op, values, name):
print ('Add const node: ', op.name + '/' + name)
tensor = net_def.tensors.add()
node_name = op.name + '/' + name
tensor.name = node_name + ':0'
tensor.data_type = mace_pb2.DT_INT32
tensor.dims.extend(values)
return tensor.name
def convert_op_outputs(mace_op_def, tf_op):
mace_op_def.output_type.extend([tf_dtype_2_mace_dtype(output.dtype)
for output in tf_op.outputs])
output_shapes = []
for output in tf_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
mace_op_def.output_shape.extend(output_shapes)
def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
first_op = unresolved_ops[0]
print ('Op: ', first_op.name, first_op.type, first_op.outputs[0].shape)
if first_op.name in resolved_ops:
pass
elif first_op.type == 'Const':
print ('Add const node: ', first_op.name)
tf_tensor = first_op.outputs[0].eval()
tensor = net_def.tensors.add()
tensor.name = first_op.outputs[0].name
tensor.data_type = find_dtype(first_op.outputs[0].dtype)
shape = list(tf_tensor.shape)
if len(shape) > 0:
tensor.dims.extend(shape)
if first_op.outputs[0].dtype == tf.float32:
tensor.float_data.extend(tf_tensor.astype(float).flat)
elif first_op.outputs[0].dtype == tf.int32 or \
first_op.outputs[0].dtype == tf.int8 or \
first_op.outputs[0].dtype == tf.int16 or \
first_op.outputs[0].dtype == tf.quint8 or \
first_op.outputs[0].dtype == tf.quint16:
tensor.int32_data.extend(tf_tensor.astype(int).flat)
else:
op_def = net_def.op.add()
op_def.name = first_op.name
op_def.type = dsp_ops.map_nn_op(first_op.type)
op_def.padding = padding_mode['NA']
if len(first_op.outputs) > 0 and first_op.type == 'Dequantize' \
and len(first_op.outputs[0].consumers()) > 0 \
and (first_op.outputs[0].consumers()[0].type == 'SpaceToBatchND' \
or first_op.outputs[0].consumers()[0].type == 'BatchToSpaceND'):
input_tensor = first_op.inputs[0]
min_tensor = first_op.inputs[1]
max_tensor = first_op.inputs[2]
s2b_op = first_op.outputs[0].consumers()[0]
reshape_op = s2b_op.outputs[0].consumers()[0]
min_op = reshape_op.outputs[0].consumers()[0]
max_op = reshape_op.outputs[0].consumers()[1]
quantize_op = min_op.outputs[0].consumers()[0]
resolved_ops.add(s2b_op.name)
resolved_ops.add(reshape_op.name)
resolved_ops.add(min_op.name)
resolved_ops.add(max_op.name)
resolved_ops.add(quantize_op.name)
op_def.name = quantize_op.name
op_def.type = dsp_ops.map_nn_op('Quantized' + s2b_op.type)
op_def.input.append(input_tensor.name)
op_def.input.extend([t.name for t in s2b_op.inputs[1:]])
op_def.input.extend([min_tensor.name, max_tensor.name])
op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_op.outputs])
convert_op_outputs(op_def, quantize_op)
elif len(first_op.outputs) > 0 and first_op.type == 'QuantizedReshape' \
and len(first_op.outputs[0].consumers()) > 0 \
and first_op.outputs[0].consumers()[0].type == 'Dequantize' \
and len(first_op.outputs[0].consumers()[0].outputs[0].consumers()) > 0 \
and first_op.outputs[0].consumers()[0].outputs[0].consumers()[0].type == 'Softmax':
input_tensor = first_op.inputs[0]
min_tensor = first_op.inputs[2]
max_tensor = first_op.inputs[3]
dequantize_op = first_op.outputs[0].consumers()[0]
softmax_op = dequantize_op.outputs[0].consumers()[0]
reshape_op = softmax_op.outputs[0].consumers()[0]
min_op = reshape_op.outputs[0].consumers()[0]
max_op = reshape_op.outputs[0].consumers()[1]
quantize_op = min_op.outputs[0].consumers()[0]
quantize_reshape_op = quantize_op.outputs[0].consumers()[0]
resolved_ops.add(dequantize_op.name)
resolved_ops.add(softmax_op.name)
resolved_ops.add(reshape_op.name)
resolved_ops.add(min_op.name)
resolved_ops.add(max_op.name)
resolved_ops.add(quantize_op.name)
resolved_ops.add(quantize_reshape_op.name)
op_def.name = quantize_reshape_op.name
op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax')
op_def.input.extend([input_tensor.name, min_tensor.name, max_tensor.name])
op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_reshape_op.outputs])
convert_op_outputs(op_def, quantize_reshape_op)
elif has_padding_and_strides(first_op):
op_def.padding = padding_mode[first_op.get_attr('padding')]
op_def.input.extend([t.name for t in first_op.inputs])
if 'ksize' in first_op.node_def.attr:
ksize = first_op.get_attr('ksize')
ksize_tensor = add_shape_const_node(net_def, first_op, ksize, 'ksize')
op_def.input.extend([ksize_tensor])
strides = first_op.get_attr('strides')
strides_tensor = add_shape_const_node(net_def, first_op, strides, 'strides')
op_def.input.extend([strides_tensor])
op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
elif is_node_flatten_reshape(first_op):
op_def.type = 'Flatten'
op_def.input.extend([t.name for t in first_op.inputs])
op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
elif dsp_ops.has_op(first_op.type):
op_def.input.extend([t.name for t in first_op.inputs])
op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
else:
raise Exception('Unsupported op: ', first_op)
resolved_ops.add(first_op.name)
del unresolved_ops[0]

def add_output_node(net_def, output_node):
  op_def = net_def.op.add()
  op_def.name = '__output__'
  op_def.type = 'OUTPUT'
  op_def.input.extend([get_tensor_name_from_op(output_node, 0)])

def reverse_batch_to_space_and_biasadd(net_def):
  tensor_map = {}
  for tensor in net_def.tensors:
    tensor_map[tensor.name] = tensor
  op_map = {}
  for op in net_def.op:
    op_map[op.name] = op
  consumers = {}
  for op in net_def.op:
    for ipt in op.input:
      if ipt not in consumers:
        consumers[ipt] = []
      consumers[ipt].append(op)

  new_ops = []
  skip_ops = set()
  visited_ops = set()
  for op in net_def.op:
    if op.name in visited_ops:
      continue
    # pattern: QConv -> RR -> R -> QB2S -> QBiasAdd -> RR -> R
    if op.type == 'Requantize_32to8':
      biasadd_requantize_op = op
      biasadd_op = get_node_from_map(op_map, biasadd_requantize_op.input[0])
      if biasadd_op.type == 'QuantizedBiasAdd_8p8to32':
        b2s_op = get_node_from_map(op_map, biasadd_op.input[0])
        if b2s_op.type == 'QuantizedBatchToSpaceND_8':
          conv_requantize_op = get_node_from_map(op_map, b2s_op.input[0])
          conv_op = get_node_from_map(op_map, conv_requantize_op.input[0])
          if conv_op.type == 'QuantizedConv2d_8x8to32':
            new_biasadd_op = mace_pb2.OperatorDef()
            new_biasadd_op.CopyFrom(biasadd_op)
            new_biasadd_op.input[0] = get_tensor_name_from_op(conv_requantize_op.name, 0)
            new_biasadd_op.input[2] = get_tensor_name_from_op(conv_requantize_op.name, 1)
            new_biasadd_op.input[3] = get_tensor_name_from_op(conv_requantize_op.name, 2)
            new_biasadd_op.out_max_byte_size[0] = conv_requantize_op.out_max_byte_size[0] * 4

            new_biasadd_requantize_op = mace_pb2.OperatorDef()
            new_biasadd_requantize_op.CopyFrom(biasadd_requantize_op)
            new_biasadd_requantize_op.out_max_byte_size[0] = new_biasadd_op.out_max_byte_size[0] / 4

            new_b2s_op = mace_pb2.OperatorDef()
            new_b2s_op.CopyFrom(b2s_op)
            new_b2s_op.input[0] = get_tensor_name_from_op(biasadd_requantize_op.name, 0)
            new_b2s_op.input[3] = get_tensor_name_from_op(biasadd_requantize_op.name, 1)
            new_b2s_op.input[4] = get_tensor_name_from_op(biasadd_requantize_op.name, 2)

            new_ops.extend([new_biasadd_op, new_biasadd_requantize_op, new_b2s_op])
            skip_ops = skip_ops.union([biasadd_op.name, biasadd_requantize_op.name, b2s_op.name])
            visited_ops.add(op.name)

            follow_ops = consumers[get_tensor_name_from_op(biasadd_requantize_op.name, 0)]
            for follow_op in follow_ops:
              new_follow_op = mace_pb2.OperatorDef()
              new_follow_op.CopyFrom(follow_op)
              for i in xrange(len(follow_op.input)):
                for k in xrange(3):
                  if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k):
                    new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k)
              new_ops.append(new_follow_op)
              skip_ops.add(follow_op.name)
              visited_ops.add(follow_op.name)
    visited_ops.add(op.name)

  new_net_def = mace_pb2.NetDef()
  new_net_def.tensors.extend(tensor_map.values())
  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
  new_net_def.op.extend(new_ops)
  return new_net_def
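
# A minimal illustration (plain Python, no mace_pb2 needed) of the reorder the
# pass above performs; the op names are the quantized nnlib types it matches.
# Before: the conv output is batch-to-space'd, then the bias is added:
pattern_before = ['QuantizedConv2d_8x8to32', 'Requantize_32to8',
                  'QuantizedBatchToSpaceND_8', 'QuantizedBiasAdd_8p8to32',
                  'Requantize_32to8']
# After: the bias is added right after the conv and BatchToSpaceND runs last,
# with downstream consumers rewired to read from the relocated B2S outputs:
pattern_after = ['QuantizedConv2d_8x8to32', 'Requantize_32to8',
                 'QuantizedBiasAdd_8p8to32', 'Requantize_32to8',
                 'QuantizedBatchToSpaceND_8']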

def add_node_id(net_def):
  node_id_counter = 0
  node_id_map = {}
  for tensor in net_def.tensors:
    tensor.node_id = node_id_counter
    node_id_counter += 1
    tensor_op, port = get_op_and_port_from_tensor(tensor.name)
    node_id_map[tensor_op] = tensor.node_id
  for op in net_def.op:
    op.node_id = node_id_counter
    node_id_counter += 1
    node_id_map[op.name] = op.node_id
    for ipt in op.input:
      op_name, port = get_op_and_port_from_tensor(ipt)
      node_id = node_id_map[op_name]
      node_input = op.node_input.add()
      node_input.node_id = node_id
      node_input.output_port = int(port)
  return net_def
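
# add_node_id relies on tensor names following TensorFlow's 'op_name:port'
# convention. The real get_op_and_port_from_tensor helper is defined earlier
# in this converter (not shown here); a hypothetical sketch of its behavior:
def _get_op_and_port_sketch(tensor_name):
  # 'conv1:0' -> ('conv1', '0'); a bare op name maps to output port 0.
  if ':' in tensor_name:
    op_name, port = tensor_name.split(':', 1)
    return op_name, port
  return tensor_name, '0'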

def add_input_output_info(net_def, input_node, output_node, graph, dtype):
  input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0))
  output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0))

  input_info = net_def.input_info.add()
  input_info.dims.extend(input_tensor.shape.as_list())
  input_info.data_type = dtype
  if dtype == mace_pb2.DT_UINT8:
    # Quantized I/O carries two extra scalar float tensors (presumably the
    # min/max range of the quantized data).
    for i in xrange(2):
      input_info = net_def.input_info.add()
      input_info.dims.extend([1, 1, 1, 1])
      input_info.data_type = mace_pb2.DT_FLOAT

  output_info = net_def.output_info.add()
  output_info.dims.extend(output_tensor.shape.as_list())
  output_info.data_type = dtype
  if dtype == mace_pb2.DT_UINT8:
    for i in xrange(2):
      output_info = net_def.output_info.add()
      output_info.dims.extend([1, 1, 1, 1])
      output_info.data_type = mace_pb2.DT_FLOAT

  return net_def

def fuse_quantize(net_def, input_node, output_node):
  tensor_map = {}
  for tensor in net_def.tensors:
    tensor_map[tensor.name] = tensor
  op_map = {}
  for op in net_def.op:
    op_map[op.name] = op
  consumers = {}
  for op in net_def.op:
    for ipt in op.input:
      if ipt not in consumers:
        consumers[ipt] = []
      consumers[ipt].append(op)

  skip_ops = set()
  new_ops = []
  skip_tensors = set()
  # pattern: INPUT -> Flatten -> Min_f, Max_f -> Quantize
  for op in net_def.op:
    if op.type == 'INPUT':
      input_op = op
      flatten_op = None
      quantize_op = None
      for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
        if o.type == 'Flatten':
          flatten_op = o
        elif o.type == 'Quantize':
          quantize_op = o
      if quantize_op is not None:
        minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)]
        skip_ops = skip_ops.union([flatten_op.name, minf_op.name, maxf_op.name])
        skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
        # AutoQuantize presumably computes the input range itself, so the
        # explicit Flatten/Min/Max subgraph and the min/max inputs are dropped.
        quantize_op.type = 'AutoQuantize'
        del quantize_op.input[1:]

  new_net_def = mace_pb2.NetDef()
  new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors])
  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
  new_net_def.op.extend(new_ops)
  return new_net_def
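
# A minimal illustration of the fusion above; op names match what the pass
# looks for:
#   before: INPUT -> Flatten -> Min_f/Max_f -> Quantize(data, min, max)
#   after:  INPUT -> AutoQuantize(data)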

def convert_to_mace_pb(model_file, input_node, output_node, dsp_mode):
  """
  nnlib does not have batch norm, so use the tensorflow optimizer to fold
  batch norm into convolution. The fold optimization reorders ops, so
  we first sort ops by topology.
  """
  input_graph_def = tf.GraphDef()
  with gfile.Open(model_file, "rb") as f:
    data = f.read()
    input_graph_def.ParseFromString(data)
  input_graph_def = graph_util.sort_tf_graph(input_graph_def)
  net_def = mace_pb2.NetDef()

  with tf.Session() as session:
    with session.graph.as_default() as graph:
      tf.import_graph_def(input_graph_def, name="")
      ops = graph.get_operations()
      dsp_ops = DspOps()
      resolved_ops = set()
      # convert const nodes first
      unresolved_ops = [op for op in ops if op.type == 'Const']
      while len(unresolved_ops) > 0:
        convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops)
      # then convert the remaining op nodes
      unresolved_ops = [op for op in ops if op.type != 'Const']
      while len(unresolved_ops) > 0:
        convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops)

      add_output_node(net_def, output_node)
      net_def = reverse_batch_to_space_and_biasadd(net_def)
      net_def = fuse_quantize(net_def, input_node, output_node)

      sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
      net_def_with_node_id = add_node_id(sorted_net_def)

      dtype = mace_pb2.DT_FLOAT
      final_net_def = add_input_output_info(
          net_def_with_node_id, input_node, output_node, graph, dtype)

      arg = final_net_def.arg.add()
      arg.name = 'dsp_mode'
      arg.i = dsp_mode

  return final_net_def
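
# Hypothetical end-to-end use of the converter above; the file and node names
# are illustrative, and writing the NetDef out uses the standard protobuf API.
if __name__ == '__main__':
  converted_net_def = convert_to_mace_pb(
      model_file='model_quantized.pb',  # a frozen TensorFlow GraphDef
      input_node='input',
      output_node='output',
      dsp_mode=0)  # stored on the NetDef as the 'dsp_mode' arg
  with open('model_dsp.pb', 'wb') as f:
    f.write(converted_net_def.SerializeToString())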
#!/bin/bash

Usage() {
  echo "Usage: bash tools/benchmark.sh model_output_dir"
}

if [ $# -lt 1 ]; then
  Usage
  exit 1
fi

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

MODEL_OUTPUT_DIR=$1

if [ -f "$MODEL_OUTPUT_DIR/benchmark_model" ]; then
  rm -rf $MODEL_OUTPUT_DIR/benchmark_model
fi

if [ "$EMBED_MODEL_DATA" = 0 ]; then
  cp codegen/models/${MODEL_TAG}/${MODEL_TAG}.data $MODEL_OUTPUT_DIR
fi

if [ x"$TARGET_ABI" = x"host" ]; then
  bazel build --verbose_failures -c opt --strip always benchmark:benchmark_model \
    --copt="-std=c++11" \
    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
    --copt="-Werror=return-type" \
    --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
    --copt="-O3" \
    --define openmp=true \
    --define production=true || exit 1
  cp bazel-bin/benchmark/benchmark_model $MODEL_OUTPUT_DIR

  MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
  ${MODEL_OUTPUT_DIR}/benchmark_model \
    --model_data_file=${PHONE_DATA_DIR}/${MODEL_TAG}.data \
    --device=${DEVICE_TYPE} \
    --input_node="${INPUT_NODES}" \
    --input_shape="${INPUT_SHAPES}" \
    --output_node="${OUTPUT_NODES}" \
    --output_shape="${OUTPUT_SHAPES}" \
    --input_file=${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME} || exit 1
else
  bazel build --verbose_failures -c opt --strip always benchmark:benchmark_model \
    --crosstool_top=//external:android/crosstool \
    --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
    --cpu=${TARGET_ABI} \
    --copt="-std=c++11" \
    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
    --copt="-Werror=return-type" \
    --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
    --copt="-O3" \
    --define openmp=true \
    --define production=true || exit 1
  cp bazel-bin/benchmark/benchmark_model $MODEL_OUTPUT_DIR

  adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1
  IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}"
  for NAME in "${INPUT_NAMES[@]}"; do
    FORMATTED_NAME=$(sed 's/[^[:alnum:]]/_/g' <<< ${NAME})
    adb push ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME} ${PHONE_DATA_DIR} || exit 1
  done
  adb push ${MODEL_OUTPUT_DIR}/benchmark_model ${PHONE_DATA_DIR} || exit 1
  if [ "$EMBED_MODEL_DATA" = 0 ]; then
    adb push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data ${PHONE_DATA_DIR} || exit 1
  fi

  adb </dev/null shell \
    LD_LIBRARY_PATH=${PHONE_DATA_DIR} \
    MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
    MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
    MACE_LIMIT_OPENCL_KERNEL_TIME=${LIMIT_OPENCL_KERNEL_TIME} \
    MACE_OPENCL_PROFILING=1 \
    ${PHONE_DATA_DIR}/benchmark_model \
    --model_data_file=${PHONE_DATA_DIR}/${MODEL_TAG}.data \
    --device=${DEVICE_TYPE} \
    --input_node="${INPUT_NODES}" \
    --input_shape="${INPUT_SHAPES}" \
    --output_node="${OUTPUT_NODES}" \
    --output_shape="${OUTPUT_SHAPES}" \
    --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} || exit 1
fi
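
# Example invocation (directory name illustrative), after env.sh has been
# generated for the target model and ABI:
#   bash tools/benchmark.sh builds/${MODEL_TAG}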
#!/bin/bash

Usage() {
  echo "Usage: bash tools/build_mace_run.sh production_mode model_output_dir hexagon_mode"
}

if [ $# -lt 3 ]; then
  Usage
  exit 1
fi

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

PRODUCTION_MODE=$1
MODEL_OUTPUT_DIR=$2
HEXAGON_MODE=$3

if [ "$PRODUCTION_MODE" = 1 ]; then
  PRODUCTION_MODE_BUILD_FLAGS="--define production=true"
fi

if [ x"$TARGET_ABI" = x"host" ]; then
  bazel build --verbose_failures -c opt --strip always //mace/codegen:generated_models \
    --copt="-std=c++11" \
    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
    --copt="-Werror=return-type" \
    --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
    --define openmp=true \
    --copt="-O3" \
    $PRODUCTION_MODE_BUILD_FLAGS || exit 1
  bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
    --copt="-std=c++11" \
    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
    --copt="-Werror=return-type" \
    --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
    --define openmp=true \
    --copt="-O3" \
    $PRODUCTION_MODE_BUILD_FLAGS || exit 1
else
  if [ "$HEXAGON_MODE" = 1 ]; then
    HEXAGON_MODE_BUILD_FLAG="--define hexagon=true"
  fi
  bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
    --crosstool_top=//external:android/crosstool \
    --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
    --cpu=${TARGET_ABI} \
    --copt="-std=c++11" \
    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
    --copt="-Werror=return-type" \
    --copt="-DMACE_OBFUSCATE_LITERALS" \
    --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
    --define openmp=true \
    --copt="-O3" \
    $PRODUCTION_MODE_BUILD_FLAGS \
    $HEXAGON_MODE_BUILD_FLAG || exit 1
fi

if [ "$PRODUCTION_MODE" = 1 ]; then
  cp $GENERATED_MODEL_LIB_PATH $MODEL_OUTPUT_DIR/libmace_${MODEL_TAG}.a
fi

if [ -f "$MODEL_OUTPUT_DIR/mace_run" ]; then
  rm -rf $MODEL_OUTPUT_DIR/mace_run
fi
cp bazel-bin/mace/examples/mace_run $MODEL_OUTPUT_DIR

if [ "$EMBED_MODEL_DATA" = 0 ]; then
  cp mace/codegen/models/${MODEL_TAG}/${MODEL_TAG}.data $MODEL_OUTPUT_DIR
fi

# copy model header file to build output dir
cp mace/codegen/models/${MODEL_TAG}/${MODEL_TAG}.h $MODEL_OUTPUT_DIR
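
# Example invocation (argument values illustrative): production mode enabled,
# Hexagon DSP mode disabled:
#   bash tools/build_mace_run.sh 1 builds/${MODEL_TAG} 0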
#!/bin/bash

Usage() {
  echo "Usage: bash tools/build_production_code.sh"
}

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

build_host_target()
{
  BAZEL_TARGET=$1
  bazel build --verbose_failures -c opt --strip always $BAZEL_TARGET \
    --copt="-std=c++11" \
    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
    --copt="-Werror=return-type" \
    --copt="-DMACE_OBFUSCATE_LITERALS" \
    --copt="-O3" \
    --define openmp=true || exit 1
}

build_target()
{
  BAZEL_TARGET=$1
  bazel build --verbose_failures -c opt --strip always $BAZEL_TARGET \
    --crosstool_top=//external:android/crosstool \
    --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
    --cpu=$TARGET_ABI \
    --copt="-std=c++11" \
    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
    --copt="-Werror=return-type" \
    --copt="-O3" \
    --define openmp=true \
    --copt="-DMACE_OBFUSCATE_LITERALS" || exit 1
}

if [ x"$TARGET_ABI" = x"host" ]; then
  build_host_target //mace/codegen:generated_opencl_prod
  build_host_target //mace/codegen:generated_tuning_params
else
  build_target //mace/codegen:generated_opencl_prod
  build_target //mace/codegen:generated_tuning_params
fi
#!/bin/bash

Usage() {
  echo "Usage: bash tools/build_run_throughput_test.sh run_seconds merged_lib_file model_input_dir"
}

if [ $# -lt 3 ]; then
  Usage
  exit 1
fi

RUN_SECONDS=$1
MERGED_LIB_FILE=$2
MODEL_INPUT_DIR=$3

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

if [ "$CPU_MODEL_TAG" != '' ]; then
  CPU_MODEL_TAG_BUILD_FLAGS="--copt=-DMACE_CPU_MODEL_TAG=${CPU_MODEL_TAG}"
fi
if [ "$GPU_MODEL_TAG" != '' ]; then
  GPU_MODEL_TAG_BUILD_FLAGS="--copt=-DMACE_GPU_MODEL_TAG=${GPU_MODEL_TAG}"
fi
if [ "$DSP_MODEL_TAG" != '' ]; then
  DSP_MODEL_TAG_BUILD_FLAGS="--copt=-DMACE_DSP_MODEL_TAG=${DSP_MODEL_TAG}"
fi

cp $MERGED_LIB_FILE benchmark/libmace_merged.a

bazel build --verbose_failures -c opt --strip always benchmark:model_throughput_test \
  --crosstool_top=//external:android/crosstool \
  --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
  --cpu=${TARGET_ABI} \
  --copt="-std=c++11" \
  --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
  --copt="-Werror=return-type" \
  ${CPU_MODEL_TAG_BUILD_FLAGS} \
  ${GPU_MODEL_TAG_BUILD_FLAGS} \
  ${DSP_MODEL_TAG_BUILD_FLAGS} \
  --define openmp=true \
  --copt="-O3" || exit 1

rm benchmark/libmace_merged.a

adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1
adb push ${MODEL_INPUT_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} || exit 1
adb push bazel-bin/benchmark/model_throughput_test ${PHONE_DATA_DIR} || exit 1
if [ "$EMBED_MODEL_DATA" = 0 ]; then
  adb push codegen/models/${CPU_MODEL_TAG}/${CPU_MODEL_TAG}.data ${PHONE_DATA_DIR} || exit 1
  adb push codegen/models/${GPU_MODEL_TAG}/${GPU_MODEL_TAG}.data ${PHONE_DATA_DIR} || exit 1
  adb push codegen/models/${DSP_MODEL_TAG}/${DSP_MODEL_TAG}.data ${PHONE_DATA_DIR} || exit 1
fi
adb push lib/hexagon/libhexagon_controller.so ${PHONE_DATA_DIR} || exit 1

adb </dev/null shell \
  LD_LIBRARY_PATH=${PHONE_DATA_DIR} \
  MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
  MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
  MACE_KERNEL_PATH=$KERNEL_DIR \
  MACE_LIMIT_OPENCL_KERNEL_TIME=${LIMIT_OPENCL_KERNEL_TIME} \
  ${PHONE_DATA_DIR}/model_throughput_test \
  --input_shape="${INPUT_SHAPE}" \
  --output_shape="${OUTPUT_SHAPE}" \
  --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
  --cpu_model_data_file=${PHONE_DATA_DIR}/${CPU_MODEL_TAG}.data \
  --gpu_model_data_file=${PHONE_DATA_DIR}/${GPU_MODEL_TAG}.data \
  --dsp_model_data_file=${PHONE_DATA_DIR}/${DSP_MODEL_TAG}.data \
  --run_seconds=$RUN_SECONDS || exit 1
#!/bin/bash

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

if [ x"$TARGET_ABI" != x"host" ]; then
  adb shell rm -rf $PHONE_DATA_DIR
fi

rm -rf mace/codegen/models
git checkout -- mace/codegen/opencl/opencl_compiled_program.cc mace/codegen/tuning/tuning_params.cc
#!/bin/bash

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

python mace/python/tools/encrypt_opencl_codegen.py \
  --cl_kernel_dir=./mace/kernels/opencl/cl/ \
  --output_path=${CODEGEN_DIR}/opencl/opencl_encrypt_program.cc || exit 1

rm -rf ${CODEGEN_DIR}/version
mkdir ${CODEGEN_DIR}/version
bash mace/tools/git/gen_version_source.sh ${CODEGEN_DIR}/version/version.cc || exit 1