diff --git a/WORKSPACE b/WORKSPACE index fe00c16fecc68cd54f01465f0a8a3bb57fba6278..af04846f4460f86cf64907ce658911f2ded3fdd9 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -56,3 +56,20 @@ android_ndk_repository( # Android 5.0 api_level = 21 ) + +git_repository( + name = "com_github_gflags_gflags", + #tag = "v2.2.0", + commit = "30dbc81fb5ffdc98ea9b14b1918bfe4e8779b26e", # v2.2.0 + fix of include path + remote = "https://github.com/gflags/gflags.git" +) + +bind( + name = "gflags", + actual = "@com_github_gflags_gflags//:gflags", +) + +bind( + name = "gflags_nothreads", + actual = "@com_github_gflags_gflags//:gflags_nothreads", +) diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..498d1ea006d692ac10ec211fb9e99c6b25b4986d --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,125 @@ +FROM ubuntu:16.04 + +# Update source +# Looks like mirrors.163.com does not work in xiaomi network +# RUN sed -i 's/http:\/\/archive\.ubuntu\.com\/ubuntu\//http:\/\/mirrors\.163\.com\/ubuntu\//g' /etc/apt/sources.list +RUN apt-get update -y + +## Basic tools +RUN apt-get install -y --no-install-recommends apt-utils +RUN apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + curl \ + git \ + libcurl3-dev \ + libgoogle-glog-dev \ + libfreetype6-dev \ + libpng12-dev \ + libprotobuf-dev \ + libzmq3-dev \ + pkg-config \ + python-dev \ + python-pip \ + protobuf-compiler \ + rsync \ + software-properties-common \ + unzip \ + zip \ + zlib1g-dev \ + openjdk-8-jdk \ + openjdk-8-jre-headless \ + openssh-server \ + wget +RUN pip install --upgrade pip + +ENV ANDROID_NDK_HOME /opt/android-ndk +ENV ANDROID_NDK /opt/android-ndk +ENV ANDROID_NDK_VERSION r15c + +# Android NDK +# download +RUN mkdir /opt/android-ndk-tmp && \ + cd /opt/android-ndk-tmp && \ + wget -q https://dl.google.com/android/repository/android-ndk-${ANDROID_NDK_VERSION}-linux-x86_64.zip && \ +# uncompress + unzip -q 
android-ndk-${ANDROID_NDK_VERSION}-linux-x86_64.zip && \ +# move to its final location + mv ./android-ndk-${ANDROID_NDK_VERSION} ${ANDROID_NDK_HOME} && \ +# remove temp dir + cd ${ANDROID_NDK_HOME} && \ + rm -rf /opt/android-ndk-tmp + +# add to PATH +ENV PATH ${PATH}:${ANDROID_NDK_HOME} + +# Set up Bazel. +# Running bazel inside a `docker build` command causes trouble, cf: +# https://github.com/bazelbuild/bazel/issues/134 +# The easiest solution is to set up a bazelrc file forcing --batch. +RUN echo "startup --batch" >>/etc/bazel.bazelrc +# Similarly, we need to workaround sandboxing issues: +# https://github.com/bazelbuild/bazel/issues/418 +RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ + >>/etc/bazel.bazelrc +# Install the most recent bazel release. +ENV BAZEL_VERSION 0.7.0 +WORKDIR / +RUN mkdir /bazel && \ + cd /bazel && \ + wget https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \ + chmod +x bazel-*.sh && \ + ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \ + cd / && \ + rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh + +# Setup vim +RUN apt-get update -y +RUN apt-get install -y --no-install-recommends \ + locales \ + vim + +RUN mkdir -p ~/.vim/autoload ~/.vim/bundle && \ + curl -LSso ~/.vim/autoload/pathogen.vim https://tpo.pe/pathogen.vim + +RUN mkdir -p ~/.vim/bundle +RUN cd ~/.vim/bundle && \ + git clone https://github.com/scrooloose/nerdtree.git && \ + git clone https://github.com/vim-syntastic/syntastic.git && \ + git clone https://github.com/vim-airline/vim-airline.git && \ + git clone https://github.com/altercation/vim-colors-solarized.git && \ + git clone https://github.com/bazelbuild/vim-ft-bzl.git && \ + git clone https://github.com/google/vim-maktaba.git && \ + git clone https://github.com/google/vim-codefmt.git + +RUN curl -LSso ~/.vimrc 
https://gist.githubusercontent.com/llhe/71c5802919debd5825c100c0135478a7/raw/16a35020cc65f9c25d0cf8f11a3ba7b345a1271d/.vimrc
+
+RUN locale-gen en_US.UTF-8
+ENV LC_CTYPE=en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+ENV TERM xterm-256color
+
+# Extra packages (debug/dev conveniences for an interactive dev image)
+RUN apt-get install -y --no-install-recommends \
+    telnet \
+    net-tools \
+    inetutils-ping \
+    screen \
+    android-tools-adb
+
+# Install Python tools. NOTE(review): plain-http mirror — packages are not
+# transport-verified; consider an https index.
+RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com setuptools
+RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.4.0 \
+    scipy \
+    jinja2 \
+    pyyaml
+
+# Download tensorflow tools
+RUN wget http://cnbj1-inner-fds.api.xiaomi.net/mace/tool/transform_graph && \
+    chmod +x transform_graph
+
+# Install gitlab runner. NOTE(review): curl | bash of an unpinned script.
+RUN curl -L https://packages.gitlab.com/install/repositories/runner/gitlab-ci-multi-runner/script.deb.sh | bash
+RUN apt-get install -y gitlab-ci-multi-runner
+
+ENTRYPOINT ["gitlab-runner", "run"]
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d5c919e419baf10ea961c29cc6bd422a487922cb
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,27 @@
+# 包含mace环境的docker镜像
+========
+
+* Login in [小米容器仓库](http://docs.api.xiaomi.net/docker-registry/)
+
+  ```
+  docker login cr.d.xiaomi.net
+  ```
+
+* 使用`Dockerfile`编译镜像
+
+  ```
+  docker build -t cr.d.xiaomi.net/mace/mace-dev .
+  ```
+
+* 或者从镜像仓库直接pull镜像
+
+  ```
+  docker pull cr.d.xiaomi.net/mace/mace-dev
+  ```
+
+* 启动容器
+
+  ```
+  # Set 'host' network to use ADB
+  docker run -it --rm -v /local/path:/container/path --net=host cr.d.xiaomi.net/mace/mace-dev /bin/bash
+  ```
diff --git a/docker/caffe/Dockerfile b/docker/caffe/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..4bf9e94b7ef550ca7230ce87579a1ccd7af18409
--- /dev/null
+++ b/docker/caffe/Dockerfile
@@ -0,0 +1,46 @@
+FROM ubuntu:16.04
+LABEL maintainer caffe-maint@googlegroups.com
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    cmake \
+    git \
+    wget \
+    libatlas-base-dev \
+    libboost-all-dev \
+    libgflags-dev \
+    libgoogle-glog-dev \
+    libhdf5-serial-dev \
+    libleveldb-dev \
+    liblmdb-dev \
+    libopencv-dev \
+    libprotobuf-dev \
+    libsnappy-dev \
+    protobuf-compiler \
+    python-dev \
+    python-numpy \
+    python-pip \
+    python-setuptools \
+    python-scipy && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV CAFFE_ROOT=/opt/caffe
+WORKDIR $CAFFE_ROOT
+
+# FIXME: use ARG instead of ENV once DockerHub supports this
+# https://github.com/docker/hub-feedback/issues/460
+ENV CLONE_TAG=1.0
+
+RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \
+    pip install --upgrade pip && \
+    cd python && for req in $(cat requirements.txt) pydot; do pip install $req; done && cd .. && \
+    mkdir build && cd build && \
+    cmake -DCPU_ONLY=1 ..
&& \ + make -j"$(nproc)" + +ENV PYCAFFE_ROOT $CAFFE_ROOT/python +ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH +ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH +RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig + +WORKDIR /mace diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..457922879b8b4c630c70ed852e8a4e3138bd6793 --- /dev/null +++ b/mace/benchmark/BUILD @@ -0,0 +1,66 @@ +# Benchmark +# Examples +load( + "//:mace.bzl", + "if_production_mode", + "if_not_production_mode", + "if_hexagon_enabled", + "if_openmp_enabled", +) + +licenses(["notice"]) # Apache 2.0 + +cc_library( + name = "stat_summarizer", + srcs = ["stat_summarizer.cc"], + hdrs = ["stat_summarizer.h"], + linkstatic = 1, + deps = [ + "@mace//:mace_headers", + ], +) + +cc_binary( + name = "benchmark_model", + srcs = [ + "benchmark_model.cc", + ], + linkopts = if_openmp_enabled(["-fopenmp"]), + linkstatic = 1, + deps = [ + ":stat_summarizer", + "//mace/codegen:generated_models", + "//external:gflags_nothreads", + ] + if_hexagon_enabled([ + "//lib/hexagon:hexagon", + ]) + if_production_mode([ + "@mace//:mace_prod", + "//codegen:generated_opencl_prod", + "//codegen:generated_tuning_params", + ]) + if_not_production_mode([ + "@mace//:mace_dev", + ]), +) + +cc_library( + name = "libmace_merged", + srcs = [ + "libmace_merged.a", + ], + visibility = ["//visibility:private"], +) + +cc_binary( + name = "model_throughput_test", + srcs = ["model_throughput_test.cc"], + linkopts = if_openmp_enabled(["-fopenmp"]), + linkstatic = 1, + deps = [ + ":libmace_merged", + "//external:gflags_nothreads", + "//lib/hexagon", + "@mace//:mace", + "@mace//:mace_headers", + "@mace//:mace_prod", + ], +) diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc new file mode 100644 index 0000000000000000000000000000000000000000..26bfeb618bdeff76c56e9ceb2cb33930a43dc461 --- /dev/null +++ 
b/mace/benchmark/benchmark_model.cc @@ -0,0 +1,362 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "gflags/gflags.h" +#include "mace/public/mace.h" +#include "mace/utils/logging.h" +#include "benchmark/stat_summarizer.h" + +#include +#include +#include +#include +#include + +namespace mace { +namespace MACE_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} +} + +namespace mace { +namespace str_util { + +std::vector Split(const std::string &str, char delims) { + std::vector result; + std::string tmp = str; + while (!tmp.empty()) { + size_t next_offset = tmp.find(delims); + result.push_back(tmp.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } + return result; +} + +bool SplitAndParseToInts(const std::string &str, + char delims, + std::vector *result) { + std::string tmp = str; + while (!tmp.empty()) { + int64_t dim = atoi(tmp.data()); + result->push_back(dim); + size_t next_offset = tmp.find(delims); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } + return true; +} + +} // namespace str_util + +namespace benchmark { + +void ParseShape(const std::string &str, std::vector *shape) { + std::string tmp = str; + while (!tmp.empty()) { + int dim = atoi(tmp.data()); + shape->push_back(dim); + size_t next_offset = tmp.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +std::string FormatName(const std::string input) { + std::string res = input; + for (size_t i = 0; i < input.size(); ++i) { + if (!::isalnum(res[i])) res[i] = '_'; + } + return res; +} + +inline int64_t NowMicros() { + struct timeval tv; + gettimeofday(&tv, nullptr); 
+ return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +} + +bool RunInference(MaceEngine *engine, + const std::vector &input_infos, + std::map &output_infos, + StatSummarizer *summarizer, + int64_t *inference_time_us) { + RunMetadata run_metadata; + RunMetadata *run_metadata_ptr = nullptr; + if (summarizer) { + run_metadata_ptr = &run_metadata; + } + if (input_infos.size() == 1 && output_infos.size() == 1) { + const int64_t start_time = NowMicros(); + bool s = engine->Run(input_infos[0].data, input_infos[0].shape, + output_infos.begin()->second, run_metadata_ptr); + const int64_t end_time = NowMicros(); + + if (!s) { + LOG(ERROR) << "Error during inference."; + return s; + } + *inference_time_us = end_time - start_time; + } else { + const int64_t start_time = NowMicros(); + bool s = engine->Run(input_infos, output_infos, run_metadata_ptr); + const int64_t end_time = NowMicros(); + + if (!s) { + LOG(ERROR) << "Error during inference."; + return s; + } + *inference_time_us = end_time - start_time; + } + + if (summarizer != nullptr) { + summarizer->ProcessMetadata(run_metadata); + } + + return true; +} + +bool Run(MaceEngine *engine, + const std::vector &input_infos, + std::map &output_infos, + StatSummarizer *summarizer, + int num_runs, + double max_time_sec, + int64_t sleep_sec, + int64_t *total_time_us, + int64_t *actual_num_runs) { + *total_time_us = 0; + + LOG(INFO) << "Running benchmark for max " << num_runs << " iterators, max " + << max_time_sec << " seconds " + << (summarizer != nullptr ? 
"with " : "without ") + << "detailed stat logging, with " << sleep_sec + << "s sleep between inferences"; + + Stat stat; + + bool util_max_time = (num_runs <= 0); + for (int i = 0; util_max_time || i < num_runs; ++i) { + int64_t inference_time_us = 0; + bool s = RunInference(engine, input_infos, output_infos, summarizer, &inference_time_us); + stat.UpdateStat(inference_time_us); + (*total_time_us) += inference_time_us; + ++(*actual_num_runs); + + if (max_time_sec > 0 && (*total_time_us / 1000000.0) > max_time_sec) { + break; + } + + if (!s) { + LOG(INFO) << "Failed on run " << i; + return s; + } + + if (sleep_sec > 0) { + std::this_thread::sleep_for(std::chrono::seconds(sleep_sec)); + } + } + + std::stringstream stream; + stat.OutputToStream(&stream); + LOG(INFO) << stream.str(); + + return true; +} + +DEFINE_string(device, "CPU", "Device [CPU|OPENCL]"); +DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma"); +DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma"); +DEFINE_string(input_shape, "", "input shape, separated by colon and comma"); +DEFINE_string(output_shape, "", "output shape, separated by colon and comma"); +DEFINE_string(input_file, "", "input file name"); +DEFINE_int32(max_num_runs, 100, "number of runs max"); +DEFINE_string(max_time, "10.0", "length to run max"); +DEFINE_string(inference_delay, "-1", "delay between runs in seconds"); +DEFINE_string(inter_benchmark_delay, "-1", "delay between benchmarks in seconds"); +DEFINE_string(benchmark_name, "", "benchmark name"); +DEFINE_bool(show_run_order, true, "whether to list stats by run order"); +DEFINE_int32(run_order_limit, 0, "how many items to show by run order"); +DEFINE_bool(show_time, true, "whether to list stats by time taken"); +DEFINE_int32(time_limit, 10, "how many items to show by time taken"); +DEFINE_bool(show_memory, false, "whether to list stats by memory used"); +DEFINE_int32(memory_limit, 10, "how many items to show 
by memory used"); +DEFINE_bool(show_type, true, "whether to list stats by op type"); +DEFINE_bool(show_summary, true, "whether to show a summary of the stats"); +DEFINE_bool(show_flops, true, "whether to estimate the model's FLOPs"); +DEFINE_int32(warmup_runs, 1, "how many runs to initialize model"); +DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0"); + +int Main(int argc, char **argv) { + MACE_CHECK(FLAGS_device != "HEXAGON", "Model benchmark tool do not support DSP."); + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "Benchmark name: [" << FLAGS_benchmark_name << "]"; + LOG(INFO) << "Device: [" << FLAGS_device << "]"; + LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]"; + LOG(INFO) << "Input shapes: [" << FLAGS_input_shape << "]"; + LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]"; + LOG(INFO) << "output shapes: [" << FLAGS_output_shape << "]"; + LOG(INFO) << "Warmup runs: [" << FLAGS_warmup_runs << "]"; + LOG(INFO) << "Num runs: [" << FLAGS_max_num_runs << "]"; + LOG(INFO) << "Inter-inference delay (seconds): [" << FLAGS_inference_delay << "]"; + LOG(INFO) << "Inter-benchmark delay (seconds): [" << FLAGS_inter_benchmark_delay << "]"; + + const long int inter_inference_sleep_seconds = + std::strtol(FLAGS_inference_delay.c_str(), nullptr, 10); + const long int inter_benchmark_sleep_seconds = + std::strtol(FLAGS_inter_benchmark_delay.c_str(), nullptr, 10); + const double max_benchmark_time_seconds = + std::strtod(FLAGS_max_time.c_str(), nullptr); + + std::unique_ptr stats; + + StatSummarizerOptions stats_options; + stats_options.show_run_order = FLAGS_show_run_order; + stats_options.run_order_limit = FLAGS_run_order_limit; + stats_options.show_time = FLAGS_show_time; + stats_options.time_limit = FLAGS_time_limit; + stats_options.show_memory = FLAGS_show_memory; + stats_options.memory_limit = FLAGS_memory_limit; + stats_options.show_type = 
FLAGS_show_type; + stats_options.show_summary = FLAGS_show_summary; + stats.reset(new StatSummarizer(stats_options)); + + DeviceType device_type = CPU; + if(FLAGS_device == "OPENCL") { + device_type = OPENCL; + } + + std::vector input_names = str_util::Split(FLAGS_input_node, ','); + std::vector output_names = str_util::Split(FLAGS_output_node, ','); + std::vector input_shapes = str_util::Split(FLAGS_input_shape, ':'); + std::vector output_shapes = str_util::Split(FLAGS_output_shape, ':'); + + const size_t input_count = input_shapes.size(); + const size_t output_count = output_shapes.size(); + std::vector> input_shape_vec(input_count); + std::vector> output_shape_vec(output_count); + for (size_t i = 0; i < input_count; ++i) { + ParseShape(input_shapes[i], &input_shape_vec[i]); + } + for (size_t i = 0; i < output_count; ++i) { + ParseShape(output_shapes[i], &output_shape_vec[i]); + } + + const unsigned char *model_data = + mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); + NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); + + std::vector input_infos(input_count); + std::map output_infos; + std::vector> input_datas(input_count); + std::vector> output_datas(output_count); + + for (size_t i = 0; i < input_count; ++i) { + int64_t input_size = std::accumulate(input_shape_vec[i].begin(), + input_shape_vec[i].end(), 1, std::multiplies()); + input_datas[i].reset(new float[input_size]); + // load input + std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), std::ios::in | std::ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_datas[i].get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + + input_infos[i].name = input_names[i]; + input_infos[i].shape = input_shape_vec[i]; + input_infos[i].data = input_datas[i].get(); + } + for (size_t i = 0; i < output_count; ++i) { + int64_t output_size = 
std::accumulate(output_shape_vec[i].begin(), + output_shape_vec[i].end(), 1, std::multiplies()); + output_datas[i].reset(new float[output_size]); + output_infos[output_names[i]] = output_datas[i].get(); + } + + // Init model + LOG(INFO) << "Run init"; + std::unique_ptr engine_ptr; + if (input_count == 1 && output_count == 1) { + engine_ptr.reset(new mace::MaceEngine(&net_def, device_type)); + } else { + engine_ptr.reset(new mace::MaceEngine(&net_def, device_type, input_names, output_names)); + } + if (device_type == DeviceType::OPENCL) { + mace::MACE_MODEL_TAG::UnloadModelData(model_data); + } + + LOG(INFO) << "Warm up"; + + int64_t warmup_time_us = 0; + int64_t num_warmup_runs = 0; + if (FLAGS_warmup_runs > 0) { + bool status = + Run(engine_ptr.get(), input_infos, output_infos, nullptr, FLAGS_warmup_runs, -1.0, + inter_inference_sleep_seconds, &warmup_time_us, &num_warmup_runs); + if (!status) { + LOG(ERROR) << "Failed at warm up run"; + } + } + + if (inter_benchmark_sleep_seconds > 0) { + std::this_thread::sleep_for( + std::chrono::seconds(inter_benchmark_sleep_seconds)); + } + int64_t no_stat_time_us = 0; + int64_t no_stat_runs = 0; + bool status = + Run(engine_ptr.get(), input_infos, output_infos, + nullptr, FLAGS_max_num_runs, max_benchmark_time_seconds, + inter_inference_sleep_seconds, &no_stat_time_us, &no_stat_runs); + if (!status) { + LOG(ERROR) << "Failed at normal no-stat run"; + } + + int64_t stat_time_us = 0; + int64_t stat_runs = 0; + status = Run(engine_ptr.get(), input_infos, output_infos, + stats.get(), FLAGS_max_num_runs, max_benchmark_time_seconds, + inter_inference_sleep_seconds, &stat_time_us, &stat_runs); + if (!status) { + LOG(ERROR) << "Failed at normal stat run"; + } + + LOG(INFO) << "Average inference timings in us: " + << "Warmup: " + << (FLAGS_warmup_runs > 0 ? 
warmup_time_us / FLAGS_warmup_runs : 0) << ", " + << "no stats: " << no_stat_time_us / no_stat_runs << ", " + << "with stats: " << stat_time_us / stat_runs; + + stats->PrintOperatorStats(); + + return 0; +} + +} // namespace benchmark +} // namespace mace + +int main(int argc, char **argv) { mace::benchmark::Main(argc, argv); } diff --git a/mace/benchmark/model_throughput_test.cc b/mace/benchmark/model_throughput_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..48e622cb7b79b026b89ad6eb1fb4349056b2d9e8 --- /dev/null +++ b/mace/benchmark/model_throughput_test.cc @@ -0,0 +1,281 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +/** + * Usage: + * throughput_test \ + * --input_shape=1,224,224,3 \ + * --output_shape=1,224,224,2 \ + * --input_file=input_data \ + * --cpu_model_data_file=cpu_model_data.data \ + * --gpu_model_data_file=gpu_model_data.data \ + * --dsp_model_data_file=dsp_model_data.data \ + * --run_seconds=10 + */ +#include +#include +#include +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +using namespace std; +using namespace mace; + +namespace mace { + +#ifdef MACE_CPU_MODEL_TAG +namespace MACE_CPU_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_CPU_MODEL_TAG +#endif + +#ifdef MACE_GPU_MODEL_TAG +namespace MACE_GPU_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_GPU_MODEL_TAG +#endif + +#ifdef MACE_DSP_MODEL_TAG +namespace 
MACE_DSP_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_DSP_MODEL_TAG +#endif + +} // namespace mace + +void ParseShape(const string &str, vector *shape) { + string tmp = str; + while (!tmp.empty()) { + int dim = atoi(tmp.data()); + shape->push_back(dim); + size_t next_offset = tmp.find(","); + if (next_offset == string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +DeviceType ParseDeviceType(const string &device_str) { + if (device_str.compare("CPU") == 0) { + return DeviceType::CPU; + } else if (device_str.compare("NEON") == 0) { + return DeviceType::NEON; + } else if (device_str.compare("OPENCL") == 0) { + return DeviceType::OPENCL; + } else if (device_str.compare("HEXAGON") == 0) { + return DeviceType::HEXAGON; + } else { + return DeviceType::CPU; + } +} + +DEFINE_string(input_shape, "1,224,224,3", "input shape, separated by comma"); +DEFINE_string(output_shape, "1,224,224,2", "output shape, separated by comma"); +DEFINE_string(input_file, "", "input file name"); +DEFINE_string(cpu_model_data_file, "", "cpu model data file name"); +DEFINE_string(gpu_model_data_file, "", "gpu model data file name"); +DEFINE_string(dsp_model_data_file, "", "dsp model data file name"); +DEFINE_int32(run_seconds, 10, "run seconds"); + +int main(int argc, char **argv) { + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "mace version: " << MaceVersion(); + LOG(INFO) << "mace git version: " << MaceGitVersion(); +#ifdef MACE_CPU_MODEL_TAG + LOG(INFO) << "cpu model checksum: " << mace::MACE_CPU_MODEL_TAG::ModelChecksum(); +#endif +#ifdef MACE_GPU_MODEL_TAG + LOG(INFO) << "gpu model checksum: " << mace::MACE_GPU_MODEL_TAG::ModelChecksum(); +#endif +#ifdef 
MACE_DSP_MODEL_TAG + LOG(INFO) << "dsp model checksum: " << mace::MACE_DSP_MODEL_TAG::ModelChecksum(); +#endif + LOG(INFO) << "input_shape: " << FLAGS_input_shape; + LOG(INFO) << "output_shape: " << FLAGS_output_shape; + LOG(INFO) << "input_file: " << FLAGS_input_file; + LOG(INFO) << "cpu_model_data_file: " << FLAGS_cpu_model_data_file; + LOG(INFO) << "gpu_model_data_file: " << FLAGS_gpu_model_data_file; + LOG(INFO) << "dsp_model_data_file: " << FLAGS_dsp_model_data_file; + LOG(INFO) << "run_seconds: " << FLAGS_run_seconds; + + vector input_shape_vec; + vector output_shape_vec; + ParseShape(FLAGS_input_shape, &input_shape_vec); + ParseShape(FLAGS_output_shape, &output_shape_vec); + + int64_t input_size = + std::accumulate(input_shape_vec.begin(), input_shape_vec.end(), 1, + std::multiplies()); + int64_t output_size = + std::accumulate(output_shape_vec.begin(), output_shape_vec.end(), 1, + std::multiplies()); + std::unique_ptr input_data(new float[input_size]); + std::unique_ptr cpu_output_data(new float[output_size]); + std::unique_ptr gpu_output_data(new float[output_size]); + std::unique_ptr dsp_output_data(new float[output_size]); + + // load input + ifstream in_file(FLAGS_input_file, ios::in | ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_data.get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + + int64_t t0, t1, init_micros; +#ifdef MACE_CPU_MODEL_TAG + /* --------------------- CPU init ----------------------- */ + LOG(INFO) << "Load & init cpu model and warm up"; + const unsigned char *cpu_model_data = + mace::MACE_CPU_MODEL_TAG::LoadModelData(FLAGS_cpu_model_data_file.c_str()); + NetDef cpu_net_def = mace::MACE_CPU_MODEL_TAG::CreateNet(cpu_model_data); + + mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU); + + LOG(INFO) << "CPU Warm up run"; + t0 = NowMicros(); + cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get()); + t1 = 
NowMicros();
+  LOG(INFO) << "CPU 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+#ifdef MACE_GPU_MODEL_TAG
+  /* --------------------- GPU init ----------------------- */
+  LOG(INFO) << "Load & init gpu model and warm up";
+  const unsigned char *gpu_model_data =
+      mace::MACE_GPU_MODEL_TAG::LoadModelData(FLAGS_gpu_model_data_file.c_str());
+  NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
+
+  mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL);
+  mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
+
+  LOG(INFO) << "GPU Warm up run";
+  t0 = NowMicros();
+  gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
+  t1 = NowMicros();
+  LOG(INFO) << "GPU 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+#ifdef MACE_DSP_MODEL_TAG
+  /* --------------------- DSP init ----------------------- */
+  LOG(INFO) << "Load & init dsp model and warm up";
+  // Fix: load the DSP model data file (was mistakenly FLAGS_gpu_model_data_file).
+  const unsigned char *dsp_model_data =
+      mace::MACE_DSP_MODEL_TAG::LoadModelData(FLAGS_dsp_model_data_file.c_str());
+  NetDef dsp_net_def = mace::MACE_DSP_MODEL_TAG::CreateNet(dsp_model_data);
+
+  mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON);
+  mace::MACE_DSP_MODEL_TAG::UnloadModelData(dsp_model_data);
+
+  LOG(INFO) << "DSP Warm up run";
+  t0 = NowMicros();
+  // Fix: warm up the DSP engine (was mistakenly gpu_engine).
+  dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
+  t1 = NowMicros();
+  LOG(INFO) << "DSP 1st warm up run latency: " << t1 - t0 << " us";
+#endif
+
+  double cpu_throughput = 0;
+  double gpu_throughput = 0;
+  double dsp_throughput = 0;
+  int64_t run_micros = FLAGS_run_seconds * 1000000;
+
+#ifdef MACE_CPU_MODEL_TAG
+  std::thread cpu_thread([&]() {
+    int64_t frames = 0;
+    int64_t micros = 0;
+    int64_t start = NowMicros();
+    for (; micros < run_micros; ++frames) {
+      cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
+      int64_t end = NowMicros();
+      micros = end - start;
+    }
+    cpu_throughput = frames * 1000000.0 / micros;
+  });
+#endif
+
+#ifdef MACE_GPU_MODEL_TAG + std::thread gpu_thread([&]() { + int64_t frames = 0; + int64_t micros = 0; + int64_t start = NowMicros(); + for (; micros < run_micros; ++frames) { + gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get()); + int64_t end = NowMicros(); + micros = end - start; + } + gpu_throughput = frames * 1000000.0 / micros; + }); +#endif + +#ifdef MACE_DSP_MODEL_TAG + std::thread dsp_thread([&]() { + int64_t frames = 0; + int64_t micros = 0; + int64_t start = NowMicros(); + for (; micros < run_micros; ++frames) { + dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get()); + int64_t end = NowMicros(); + micros = end - start; + } + dsp_throughput = frames * 1000000.0 / micros; + }); +#endif + + double total_throughput = 0; + +#ifdef MACE_CPU_MODEL_TAG + cpu_thread.join(); + LOG(INFO) << "CPU throughput: " << cpu_throughput << " f/s"; + total_throughput += cpu_throughput; +#endif +#ifdef MACE_GPU_MODEL_TAG + gpu_thread.join(); + LOG(INFO) << "GPU throughput: " << gpu_throughput << " f/s"; + total_throughput += gpu_throughput; +#endif +#ifdef MACE_DSP_MODEL_TAG + dsp_thread.join(); + LOG(INFO) << "DSP throughput: " << dsp_throughput << " f/s"; + total_throughput += dsp_throughput; +#endif + + LOG(INFO) << "Total throughput: " << total_throughput << " f/s"; +} diff --git a/mace/benchmark/stat_summarizer.cc b/mace/benchmark/stat_summarizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..68801fb26aef120b74a23cf561056f9f57398c6f --- /dev/null +++ b/mace/benchmark/stat_summarizer.cc @@ -0,0 +1,320 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#include "benchmark/stat_summarizer.h" +#include "mace/public/mace.h" +#include "mace/utils/logging.h" + +#include +#include +#include + +namespace mace { + +StatSummarizer::StatSummarizer(const StatSummarizerOptions &options) + : options_(options) {} + +StatSummarizer::~StatSummarizer() {} + +void StatSummarizer::Reset() { + run_total_us_.Reset(); + memory_.Reset(); + details_.clear(); +} + +void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) { + int64_t curr_total_us = 0; + int64_t mem_total = 0; + + if (run_metadata.op_stats.empty()) { + std::cerr << "Runtime op stats should not be empty" << std::endl; + abort(); + } + int64_t first_node_start_us = run_metadata.op_stats[0].stats.start_micros; + + int node_num = 0; + for (const auto &ops : run_metadata.op_stats) { + std::string name = ops.operator_name; + std::string op_type = ops.type; + + ++node_num; + const int64_t curr_time = ops.stats.end_micros - ops.stats.start_micros; + curr_total_us += curr_time; + auto result = details_.emplace(name, Detail()); + Detail *detail = &(result.first->second); + + detail->start_us.UpdateStat(ops.stats.start_micros - first_node_start_us); + detail->rel_end_us.UpdateStat(curr_time); + + // If this is the first pass, initialize some values. 
+ if (result.second) { + detail->name = name; + detail->type = op_type; + + detail->run_order = node_num; + + detail->times_called = 0; + } + + ++detail->times_called; + } + + run_total_us_.UpdateStat(curr_total_us); + memory_.UpdateStat(mem_total); +} + +std::string StatSummarizer::ShortSummary() const { + std::stringstream stream; + stream << "Timings (microseconds): "; + run_total_us_.OutputToStream(&stream); + stream << std::endl; + + stream << "Memory (bytes): "; + memory_.OutputToStream(&stream); + stream << std::endl; + + stream << details_.size() << " nodes observed" << std::endl; + return stream.str(); +} + +std::ostream &InitField(std::ostream &stream, int width) { + stream << "\t" << std::right << std::setw(width) << std::fixed + << std::setprecision(3); + return stream; +} + +std::string StatSummarizer::HeaderString(const std::string &title) const { + std::stringstream stream; + + stream << "============================== " << title + << " ==============================" << std::endl; + + InitField(stream, 14) << "[node type]"; + InitField(stream, 9) << "[start]"; + InitField(stream, 9) << "[first]"; + InitField(stream, 9) << "[avg ms]"; + InitField(stream, 8) << "[%]"; + InitField(stream, 8) << "[cdf%]"; + InitField(stream, 10) << "[mem KB]"; + InitField(stream, 9) << "[times called]"; + stream << "\t" + << "[Name]"; + return stream.str(); +} + +std::string StatSummarizer::ColumnString(const StatSummarizer::Detail &detail, + const int64_t cumulative_stat_on_node, + const Stat &stat) const { + const double start_ms = detail.start_us.avg() / 1000.0; + const double first_time_ms = detail.rel_end_us.first() / 1000.0; + const double avg_time_ms = detail.rel_end_us.avg() / 1000.0; + const double percentage = detail.rel_end_us.sum() * 100.0 / stat.sum(); + const double cdf_percentage = (cumulative_stat_on_node * 100.0f) / stat.sum(); + const int64_t times_called = detail.times_called / num_runs(); + + std::stringstream stream; + InitField(stream, 14) << 
detail.type; + InitField(stream, 9) << start_ms; + InitField(stream, 9) << first_time_ms; + InitField(stream, 9) << avg_time_ms; + InitField(stream, 7) << percentage << "%"; + InitField(stream, 7) << cdf_percentage << "%"; + InitField(stream, 10) << detail.mem_used.newest() / 1000.0; + InitField(stream, 9) << times_called; + stream << "\t" << detail.name; + + return stream.str(); +} + +void StatSummarizer::OrderNodesByMetric( + SortingMetric metric, std::vector *details) const { + std::priority_queue> sorted_list; + const int num_nodes = details_.size(); + + for (const auto &det : details_) { + const Detail *detail = &(det.second); + std::stringstream stream; + stream << std::setw(20) << std::right << std::setprecision(10) + << std::fixed; + + switch (metric) { + case BY_NAME: + stream << detail->name; + break; + case BY_RUN_ORDER: + stream << num_nodes - detail->run_order; + break; + case BY_TIME: + stream << detail->rel_end_us.avg(); + break; + case BY_MEMORY: + stream << detail->mem_used.avg(); + break; + case BY_TYPE: + stream << detail->type; + break; + default: + stream << ""; + break; + } + + sorted_list.emplace(stream.str(), detail); + } + + while (!sorted_list.empty()) { + auto entry = sorted_list.top(); + sorted_list.pop(); + details->push_back(entry.second); + } +} + +void StatSummarizer::ComputeStatsByType( + std::map *node_type_map_count, + std::map *node_type_map_time, + std::map *node_type_map_memory, + std::map *node_type_map_times_called, + int64_t *accumulated_us) const { + int64_t run_count = run_total_us_.count(); + + for (const auto &det : details_) { + const std::string node_name = det.first; + const Detail &detail = det.second; + + int64_t curr_time_val = + static_cast(detail.rel_end_us.sum() / run_count); + *accumulated_us += curr_time_val; + + int64_t curr_memory_val = detail.mem_used.newest(); + + const std::string &node_type = detail.type; + + (*node_type_map_count)[node_type] += 1; + (*node_type_map_time)[node_type] += curr_time_val; + 
(*node_type_map_memory)[node_type] += curr_memory_val; + (*node_type_map_times_called)[node_type] += detail.times_called / run_count; + } +} + +std::string StatSummarizer::GetStatsByNodeType() const { + std::stringstream stream; + + stream << "============================== Summary by node type " + "==============================" + << std::endl; + + LOG(INFO) << "Number of nodes executed: " << details_.size() << std::endl; + + std::map node_type_map_count; + std::map node_type_map_time; + std::map node_type_map_memory; + std::map node_type_map_times_called; + int64_t accumulated_us = 0; + + ComputeStatsByType(&node_type_map_count, &node_type_map_time, + &node_type_map_memory, &node_type_map_times_called, + &accumulated_us); + + // Sort them. + std::priority_queue>> + timings; + for (const auto &node_type : node_type_map_time) { + const int64_t mem_used = node_type_map_memory[node_type.first]; + timings.emplace(node_type.second, + std::pair(node_type.first, mem_used)); + } + + InitField(stream, 14) << "[Node type]"; + InitField(stream, 9) << "[count]"; + InitField(stream, 10) << "[avg ms]"; + InitField(stream, 11) << "[avg %]"; + InitField(stream, 11) << "[cdf %]"; + InitField(stream, 10) << "[mem KB]"; + InitField(stream, 10) << "[times called]"; + stream << std::endl; + + float cdf = 0.0f; + while (!timings.empty()) { + auto entry = timings.top(); + timings.pop(); + + const std::string node_type = entry.second.first; + const float memory = entry.second.second / 1000.0f; + + const int64_t node_type_total_us = entry.first; + const float time_per_run_ms = node_type_total_us / 1000.0f; + + const float percentage = + ((entry.first / static_cast(accumulated_us)) * 100.0f); + cdf += percentage; + + InitField(stream, 14) << node_type; + InitField(stream, 9) << node_type_map_count[node_type]; + InitField(stream, 10) << time_per_run_ms; + InitField(stream, 10) << percentage << "%"; + InitField(stream, 10) << cdf << "%"; + InitField(stream, 10) << memory; + 
InitField(stream, 9) << node_type_map_times_called[node_type]; + stream << std::endl; + } + stream << std::endl; + return stream.str(); +} + +std::string StatSummarizer::GetStatsByMetric(const std::string &title, + SortingMetric sorting_metric, + int num_stats) const { + std::vector details; + OrderNodesByMetric(sorting_metric, &details); + + double cumulative_stat_on_node = 0; + + std::stringstream stream; + stream << HeaderString(title) << std::endl; + int stat_num = 0; + for (auto detail : details) { + ++stat_num; + if (num_stats > 0 && stat_num > num_stats) { + break; + } + + cumulative_stat_on_node += detail->rel_end_us.sum(); + stream << ColumnString(*detail, cumulative_stat_on_node, run_total_us_) + << std::endl; + } + stream << std::endl; + return stream.str(); +} + +std::string StatSummarizer::GetOutputString() const { + std::stringstream stream; + if (options_.show_run_order) { + stream << GetStatsByMetric("Run Order", BY_RUN_ORDER, + options_.run_order_limit); + } + if (options_.show_time) { + stream << GetStatsByMetric("Top by Computation Time", BY_TIME, + options_.time_limit); + } + if (options_.show_memory) { + stream << GetStatsByMetric("Top by Memory Use", BY_MEMORY, + options_.memory_limit); + } + if (options_.show_type) { + stream << GetStatsByNodeType(); + } + if (options_.show_summary) { + stream << ShortSummary() << std::endl; + } + return stream.str(); +} + +void StatSummarizer::PrintOperatorStats() const { + std::string output = GetOutputString(); + std::istringstream iss(output); + for (std::string line; std::getline(iss, line);) { + LOG(INFO) << line; + } +} + +} // namespace mace diff --git a/mace/benchmark/stat_summarizer.h b/mace/benchmark/stat_summarizer.h new file mode 100644 index 0000000000000000000000000000000000000000..368b1e30299b77467b847396eeb20df586895618 --- /dev/null +++ b/mace/benchmark/stat_summarizer.h @@ -0,0 +1,199 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#ifndef MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_ +#define MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_ + +#include +#include +#include +#include +#include +#include +#include + +namespace mace { + +class RunMetadata; + +template +class Stat { + public: + void UpdateStat(ValueType v) { + if (count_ == 0) { + first_ = v; + } + + newest_ = v; + max_ = std::max(v, max_); + min_ = std::min(v, min_); + ++count_; + sum_ += v; + squared_sum_ += static_cast(v) * v; + } + + void Reset() { new (this) Stat(); } + + bool empty() const { return count_ == 0; } + + ValueType first() const { return first_; } + + ValueType newest() const { return newest_; } + + ValueType max() const { return max_; } + + ValueType min() const { return min_; } + + int64_t count() const { return count_; } + + ValueType sum() const { return sum_; } + + HighPrecisionValueType squared_sum() const { return squared_sum_; } + + bool all_same() const { return (count_ == 0 || min_ == max_); } + + HighPrecisionValueType avg() const { + return empty() ? std::numeric_limits::quiet_NaN() + : static_cast(sum_) / count_; + } + + ValueType std_deviation() const { + return all_same() ? 
0 : std::sqrt(squared_sum_ / count_ - avg() * avg()); + } + + void OutputToStream(std::ostream *stream) const { + if (empty()) { + *stream << "count=0"; + } else if (all_same()) { + *stream << "count=" << count_ << " curr=" << newest_; + if (count_ > 1) *stream << "(all same)"; + } else { + *stream << "count=" << count_ << " first=" << first_ + << " curr=" << newest_ << " min=" << min_ << " max=" << max_ + << " avg=" << avg() << " std=" << std_deviation(); + } + } + + friend std::ostream &operator<<(std::ostream &stream, + const Stat &stat) { + stat.OutputToStream(&stream); + return stream; + } + + private: + ValueType first_ = 0; + ValueType newest_ = 0; + ValueType max_ = std::numeric_limits::min(); + ValueType min_ = std::numeric_limits::max(); + int64_t count_ = 0; + ValueType sum_ = 0; + HighPrecisionValueType squared_sum_ = 0; +}; + +// Used to control the output of the statistics summarizer; +class StatSummarizerOptions { + public: + StatSummarizerOptions() + : show_run_order(true), + run_order_limit(0), + show_time(true), + time_limit(10), + show_memory(true), + memory_limit(10), + show_type(true), + show_summary(true) {} + + bool show_run_order; + int run_order_limit; + bool show_time; + int time_limit; + bool show_memory; + int memory_limit; + bool show_type; + bool show_summary; +}; + +// A StatSummarizer assists in performance analysis of Graph executions. +// +// It summarizes time spent executing (on GPU/CPU), memory used etc. across +// multiple executions of a single Graph from the StepStats collected during +// graph execution. +// +// See tensorflow/tools/benchmark/benchmark_model.cc for an example usage. +class StatSummarizer { + public: + enum SortingMetric { + BY_NAME, + BY_RUN_ORDER, + BY_TIME, + BY_MEMORY, + BY_TYPE, + }; + + explicit StatSummarizer(const StatSummarizerOptions &options); + + ~StatSummarizer(); + + // Adds another run's StepStats output to the aggregate counts. 
+ void ProcessMetadata(const RunMetadata &run_metadata); + + // Returns a string detailing the accumulated runtime stats in a tab-separated + // format which can be pasted into a spreadsheet for further analysis. + std::string GetOutputString() const; + + std::string ShortSummary() const; + + // Prints the string returned by GetOutputString(). + void PrintOperatorStats() const; + + void ComputeStatsByType( + std::map *node_type_map_count, + std::map *node_type_map_time, + std::map *node_type_map_memory, + std::map *node_type_map_times_called, + int64_t *accumulated_us) const; + + std::string GetStatsByNodeType() const; + + std::string GetStatsByMetric(const std::string &title, + SortingMetric sorting_metric, + int num_stats) const; + + void Reset(); + + // Returns number of runs. + int num_runs() const { return run_total_us_.count(); } + + // Returns stats of total microseconds spent by all nodes in each run. + const Stat &run_total_us() const { return run_total_us_; } + + private: + struct Detail { + std::string name; + std::string type; + int64_t run_order; + Stat start_us; + Stat rel_end_us; + Stat mem_used; + int64_t times_called; + }; + + void OrderNodesByMetric(SortingMetric sorting_metric, + std::vector *details) const; + + std::string HeaderString(const std::string &title) const; + std::string ColumnString(const Detail &detail, + const int64_t cumulative_stat_on_node, + const Stat &stat) const; + + Stat run_total_us_; + Stat memory_; + + std::map details_; + StatSummarizerOptions options_; +}; + +} // namespace mace + +#endif // MACE_TOOLS_BENCHMARK_STAT_SUMMARIZER_H_ diff --git a/mace/core/BUILD b/mace/core/BUILD index ad32a030ce300be1d37e85e74ec6b60f648b0851..962f8423a88364d807b3f404044632db70300572 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD @@ -104,6 +104,7 @@ cc_library( deps = [ ":opencl_headers", "//mace/codegen:generated_opencl_prod", + "//mace/codegen:generated_tuning_params", "//mace/utils", ], ) diff --git a/mace/examples/BUILD 
b/mace/examples/BUILD index ff47e1d98fcca18a96cee0c9a69bd5a7101554db..af806dcc06688984b7df812f1fe9dc9e77ed93d3 100644 --- a/mace/examples/BUILD +++ b/mace/examples/BUILD @@ -24,3 +24,14 @@ cc_test( "//mace/core:test_benchmark_main", ], ) + +cc_binary( + name = "mace_run", + srcs = ["mace_run.cc"], + linkopts = if_openmp_enabled(["-fopenmp"]), + linkstatic = 1, + deps = [ + "//mace/codegen:generated_models", + "//external:gflags_nothreads", + ], +) diff --git a/mace/examples/mace_run.cc b/mace/examples/mace_run.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c188c274b8fddb3fddb9c6007d2245a3cdebd06 --- /dev/null +++ b/mace/examples/mace_run.cc @@ -0,0 +1,407 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +/** + * Usage: + * mace_run --model=mobi_mace.pb \ + * --input=input_node \ + * --output=output_node \ + * --input_shape=1,224,224,3 \ + * --output_shape=1,224,224,2 \ + * --input_file=input_data \ + * --output_file=mace.out \ + * --model_data_file=model_data.data \ + * --device=OPENCL + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +using namespace std; +using namespace mace; + +namespace mace { +namespace MACE_MODEL_TAG { + +extern const unsigned char *LoadModelData(const char *model_data_file); + +extern void UnloadModelData(const unsigned char *model_data); + +extern NetDef CreateNet(const unsigned char *model_data); + +extern const std::string ModelChecksum(); + +} // namespace MACE_MODEL_TAG +} // namespace mace + + +namespace str_util { + +std::vector Split(const std::string &str, char delims) { + std::vector result; + std::string tmp = str; + while (!tmp.empty()) { + size_t next_offset = tmp.find(delims); + result.push_back(tmp.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } 
+ } + return result; +} + +} // namespace str_util + +void ParseShape(const string &str, vector *shape) { + string tmp = str; + while (!tmp.empty()) { + int dim = atoi(tmp.data()); + shape->push_back(dim); + size_t next_offset = tmp.find(","); + if (next_offset == string::npos) { + break; + } else { + tmp = tmp.substr(next_offset + 1); + } + } +} + +std::string FormatName(const std::string input) { + std::string res = input; + for (size_t i = 0; i < input.size(); ++i) { + if (!isalnum(res[i])) res[i] = '_'; + } + return res; +} + +DeviceType ParseDeviceType(const string &device_str) { + if (device_str.compare("CPU") == 0) { + return DeviceType::CPU; + } else if (device_str.compare("NEON") == 0) { + return DeviceType::NEON; + } else if (device_str.compare("OPENCL") == 0) { + return DeviceType::OPENCL; + } else if (device_str.compare("HEXAGON") == 0) { + return DeviceType::HEXAGON; + } else { + return DeviceType::CPU; + } +} + +struct mallinfo LogMallinfoChange(struct mallinfo prev) { + struct mallinfo curr = mallinfo(); + if (prev.arena != curr.arena) { + LOG(INFO) << "Non-mmapped space allocated (bytes): " << curr.arena + << ", diff: " << ((int64_t)curr.arena - (int64_t)prev.arena); + } + if (prev.ordblks != curr.ordblks) { + LOG(INFO) << "Number of free chunks: " << curr.ordblks + << ", diff: " << ((int64_t)curr.ordblks - (int64_t)prev.ordblks); + } + if (prev.smblks != curr.smblks) { + LOG(INFO) << "Number of free fastbin blocks: " << curr.smblks + << ", diff: " << ((int64_t)curr.smblks - (int64_t)prev.smblks); + } + if (prev.hblks != curr.hblks) { + LOG(INFO) << "Number of mmapped regions: " << curr.hblks + << ", diff: " << ((int64_t)curr.hblks - (int64_t)prev.hblks); + } + if (prev.hblkhd != curr.hblkhd) { + LOG(INFO) << "Space allocated in mmapped regions (bytes): " << curr.hblkhd + << ", diff: " << ((int64_t)curr.hblkhd - (int64_t)prev.hblkhd); + } + if (prev.usmblks != curr.usmblks) { + LOG(INFO) << "Maximum total allocated space (bytes): " << curr.usmblks + 
<< ", diff: " << ((int64_t)curr.usmblks - (int64_t)prev.usmblks); + } + if (prev.fsmblks != curr.fsmblks) { + LOG(INFO) << "Space in freed fastbin blocks (bytes): " << curr.fsmblks + << ", diff: " << ((int64_t)curr.fsmblks - (int64_t)prev.fsmblks); + } + if (prev.uordblks != curr.uordblks) { + LOG(INFO) << "Total allocated space (bytes): " << curr.uordblks + << ", diff: " + << ((int64_t)curr.uordblks - (int64_t)prev.uordblks); + } + if (prev.fordblks != curr.fordblks) { + LOG(INFO) << "Total free space (bytes): " << curr.fordblks << ", diff: " + << ((int64_t)curr.fordblks - (int64_t)prev.fordblks); + } + if (prev.keepcost != curr.keepcost) { + LOG(INFO) << "Top-most, releasable space (bytes): " << curr.keepcost + << ", diff: " + << ((int64_t)curr.keepcost - (int64_t)prev.keepcost); + } + return curr; +} + +DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma"); +DEFINE_string(input_shape, "1,224,224,3:1,1,1,10", "input shapes, separated by colon and comma"); +DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma"); +DEFINE_string(output_shape, "1,224,224,2:1,1,1,10", "output shapes, separated by colon and comma"); +DEFINE_string(input_file, "", "input file name | input file prefix for multiple inputs."); +DEFINE_string(output_file, "", "output file name | output file prefix for multiple outputs"); +DEFINE_string(model_data_file, "", + "model data file name, used when EMBED_MODEL_DATA set to 0"); +DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); +DEFINE_int32(round, 1, "round"); +DEFINE_int32(restart_round, 1, "restart round"); +DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); + +bool SingleInputAndOutput(const std::vector &input_shape, + const std::vector &output_shape) { + // load model + int64_t t0 = NowMicros(); + const unsigned char *model_data = + mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); + NetDef net_def = 
mace::MACE_MODEL_TAG::CreateNet(model_data); + int64_t t1 = NowMicros(); + LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us"; + int64_t init_micros = t1 - t0; + + DeviceType device_type = ParseDeviceType(FLAGS_device); + LOG(INFO) << "Runing with device type: " << device_type; + + // Init model + LOG(INFO) << "Run init"; + t0 = NowMicros(); + mace::MaceEngine engine(&net_def, device_type); + if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { + mace::MACE_MODEL_TAG::UnloadModelData(model_data); + } + t1 = NowMicros(); + init_micros += t1 - t0; + LOG(INFO) << "Net init latency: " << t1 - t0 << " us"; + LOG(INFO) << "Total init latency: " << init_micros << " us"; + + // Allocate input and output + int64_t input_size = + std::accumulate(input_shape.begin(), input_shape.end(), 1, + std::multiplies()); + int64_t output_size = + std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + std::unique_ptr input_data(new float[input_size]); + std::unique_ptr output_data(new float[output_size]); + + // load input + ifstream in_file(FLAGS_input_file + "_" + FormatName(FLAGS_input_node), ios::in | ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_data.get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + + LOG(INFO) << "Warm up run"; + t0 = NowMicros(); + engine.Run(input_data.get(), input_shape, output_data.get()); + t1 = NowMicros(); + LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us"; + + if (FLAGS_round > 0) { + LOG(INFO) << "Run model"; + t0 = NowMicros(); + struct mallinfo prev = mallinfo(); + for (int i = 0; i < FLAGS_round; ++i) { + engine.Run(input_data.get(), input_shape, output_data.get()); + if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) { + LOG(INFO) << "=== check malloc info change #" << i << " ==="; + prev = LogMallinfoChange(prev); + } + } + t1 = NowMicros(); + 
LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us"; + } + + if (FLAGS_restart_round == 1) { + if (output_data != nullptr) { + std::string + output_name = FLAGS_output_file + "_" + FormatName(FLAGS_output_node); + ofstream out_file(output_name, ios::binary); + out_file.write((const char *) (output_data.get()), + output_size * sizeof(float)); + out_file.flush(); + out_file.close(); + LOG(INFO) << "Write output file " + << output_name + << " with size " << output_size + << " done."; + } else { + LOG(INFO) << "Output data is null"; + } + } + + return true; +} + +bool MultipleInputOrOutput(const std::vector &input_names, + const std::vector> &input_shapes, + const std::vector &output_names, + const std::vector> &output_shapes) { + // load model + int64_t t0 = NowMicros(); + const unsigned char *model_data = + mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); + NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); + int64_t t1 = NowMicros(); + LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us"; + int64_t init_micros = t1 - t0; + + DeviceType device_type = ParseDeviceType(FLAGS_device); + LOG(INFO) << "Runing with device type: " << device_type; + + // Init model + LOG(INFO) << "Run init"; + t0 = NowMicros(); + mace::MaceEngine engine(&net_def, device_type, input_names, output_names); + if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { + mace::MACE_MODEL_TAG::UnloadModelData(model_data); + } + t1 = NowMicros(); + init_micros += t1 - t0; + LOG(INFO) << "Net init latency: " << t1 - t0 << " us"; + LOG(INFO) << "Total init latency: " << init_micros << " us"; + + const size_t input_count = input_names.size(); + const size_t output_count = output_names.size(); + std::vector input_infos(input_count); + std::map outputs; + std::vector> input_datas(input_count); + for (size_t i = 0; i < input_count; ++i) { + // Allocate input and output + int64_t input_size = + std::accumulate(input_shapes[i].begin(), 
input_shapes[i].end(), 1, + std::multiplies()); + input_datas[i].reset(new float[input_size]); + // load input + ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), ios::in | ios::binary); + if (in_file.is_open()) { + in_file.read(reinterpret_cast(input_datas[i].get()), + input_size * sizeof(float)); + in_file.close(); + } else { + LOG(INFO) << "Open input file failed"; + return -1; + } + input_infos[i].name = input_names[i]; + input_infos[i].shape = input_shapes[i]; + input_infos[i].data = input_datas[i].get(); + } + std::vector> output_datas(output_count); + for (size_t i = 0; i < output_count; ++i) { + int64_t output_size = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::multiplies()); + output_datas[i].reset(new float[output_size]); + outputs[output_names[i]] = output_datas[i].get(); + } + + LOG(INFO) << "Warm up run"; + t0 = NowMicros(); + engine.Run(input_infos, outputs); + t1 = NowMicros(); + LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us"; + + if (FLAGS_round > 0) { + LOG(INFO) << "Run model"; + t0 = NowMicros(); + struct mallinfo prev = mallinfo(); + for (int i = 0; i < FLAGS_round; ++i) { + engine.Run(input_infos, outputs); + if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) { + LOG(INFO) << "=== check malloc info change #" << i << " ==="; + prev = LogMallinfoChange(prev); + } + } + t1 = NowMicros(); + LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us"; + } + + for (size_t i = 0; i < output_count; ++i) { + std::string output_name = FLAGS_output_file + "_" + FormatName(output_names[i]); + ofstream out_file(output_name, ios::binary); + int64_t output_size = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::multiplies()); + out_file.write((const char *) outputs[output_names[i]], + output_size * sizeof(float)); + out_file.flush(); + out_file.close(); + LOG(INFO) << "Write output file " + << output_name + << " with size " << 
output_size + << " done."; + } + + return true; +} + +int main(int argc, char **argv) { + gflags::SetUsageMessage("some usage message"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + LOG(INFO) << "mace version: " << MaceVersion(); + LOG(INFO) << "mace git version: " << MaceGitVersion(); + LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum(); + LOG(INFO) << "input node: " << FLAGS_input_node; + LOG(INFO) << "input shape: " << FLAGS_input_shape; + LOG(INFO) << "output node: " << FLAGS_output_node; + LOG(INFO) << "output shape: " << FLAGS_output_shape; + LOG(INFO) << "input_file: " << FLAGS_input_file; + LOG(INFO) << "output_file: " << FLAGS_output_file; + LOG(INFO) << "model_data_file: " << FLAGS_model_data_file; + LOG(INFO) << "device: " << FLAGS_device; + LOG(INFO) << "round: " << FLAGS_restart_round; + LOG(INFO) << "restart_round: " << FLAGS_round; + + std::vector input_names = str_util::Split(FLAGS_input_node, ','); + std::vector output_names = str_util::Split(FLAGS_output_node, ','); + std::vector input_shapes = str_util::Split(FLAGS_input_shape, ':'); + std::vector output_shapes = str_util::Split(FLAGS_output_shape, ':'); + + const size_t input_count = input_shapes.size(); + const size_t output_count = output_shapes.size(); + std::vector> input_shape_vec(input_count); + std::vector> output_shape_vec(output_count); + for (size_t i = 0; i < input_count; ++i) { + ParseShape(input_shapes[i], &input_shape_vec[i]); + } + for (size_t i = 0; i < output_count; ++i) { + ParseShape(output_shapes[i], &output_shape_vec[i]); + } + + bool ret; +#pragma omp parallel for + for (int i = 0; i < FLAGS_restart_round; ++i) { + VLOG(0) << "restart round " << i; + if (input_count == 1 && output_count == 1) { + ret = SingleInputAndOutput(input_shape_vec[0], output_shape_vec[0]); + } else { + ret = MultipleInputOrOutput(input_names, + input_shape_vec, + output_names, + output_shape_vec); + } + } + if(ret) { + return 0; + } else { + return -1; + } +} diff 
--git a/mace/proto/BUILD b/mace/proto/BUILD index 8649197b94508615dd395a991bccfe5205042804..5222b06bda6e1681b15ac7f60317376c5d34fa3d 100644 --- a/mace/proto/BUILD +++ b/mace/proto/BUILD @@ -10,6 +10,15 @@ licenses(["notice"]) # Apache 2.0 load("@com_google_protobuf//:protobuf.bzl", "py_proto_library") +py_proto_library( + name = "mace_py", + srcs = ["mace.proto"], + default_runtime = "@com_google_protobuf//:protobuf_python", + protoc = "@com_google_protobuf//:protoc", + srcs_version = "PY2AND3", + deps = ["@com_google_protobuf//:protobuf_python"], +) + py_proto_library( name = "caffe_py", srcs = ["caffe.proto"], diff --git a/mace/proto/caffe.proto b/mace/proto/caffe.proto index f1f99e5eba428ab9d7159e49b7ff6256323ea719..22764abc33fda32026bf436b685d79aa18ade460 100644 --- a/mace/proto/caffe.proto +++ b/mace/proto/caffe.proto @@ -98,7 +98,7 @@ message NetParameter { // NOTE // Update the next available ID when you add a new SolverParameter field. // -// SolverParameter next available ID: 41 (last added: type) +// SolverParameter next available ID: 43 (last added: weights) message SolverParameter { ////////////////////////////////////////////////////////////////////////////// // Specifying the train and test networks @@ -128,8 +128,7 @@ message SolverParameter { // The states for the train/test nets. Must be unspecified or // specified once per net. // - // By default, all states will have solver = true; - // train_state will have phase = TRAIN, + // By default, train_state will have phase = TRAIN, // and all test_state's will have phase = TEST. // Other defaults are set according to the NetState defaults. optional NetState train_state = 26; @@ -187,7 +186,11 @@ message SolverParameter { optional float clip_gradients = 35 [default = -1]; optional int32 snapshot = 14 [default = 0]; // The snapshot interval - optional string snapshot_prefix = 15; // The prefix for the snapshot. + // The prefix for the snapshot. 
+ // If not set then is replaced by prototxt file path without extention. + // If is set to directory then is augmented by prototxt file name + // without extention. + optional string snapshot_prefix = 15; // whether to snapshot diff in the results or not. Snapshotting diff will help // debugging but the final protocol buffer size will be much larger. optional bool snapshot_diff = 16 [default = false]; @@ -219,7 +222,7 @@ message SolverParameter { // RMSProp decay value // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) - optional float rms_decay = 38; + optional float rms_decay = 38 [default = 0.99]; // If true, print information about the state of the net that may help with // debugging learning problems. @@ -239,6 +242,19 @@ message SolverParameter { } // DEPRECATED: use type instead of solver_type optional SolverType solver_type = 30 [default = SGD]; + + // Overlap compute and communication for data parallel training + optional bool layer_wise_reduce = 41 [default = true]; + + // Path to caffemodel file(s) with pretrained weights to initialize finetuning. + // Tha same as command line --weights parameter for caffe train command. + // If command line --weights parameter if specified, it has higher priority + // and owerwrites this one(s). + // If --snapshot command line parameter is specified, this one(s) are ignored. + // If several model files are expected, they can be listed in a one + // weights parameter separated by ',' (like in a command string) or + // in repeated weights parameters separately. 
+ repeated string weights = 42; } // A message that stores the solver snapshots @@ -389,16 +405,12 @@ message LayerParameter { optional PoolingParameter pooling_param = 121; optional PowerParameter power_param = 122; optional PReLUParameter prelu_param = 131; - optional PSROIPoolingParameter psroi_pooling_param = 149; - optional PSROIAlignParameter psroi_align_param = 1490; optional PythonParameter python_param = 130; optional RecurrentParameter recurrent_param = 146; optional ReductionParameter reduction_param = 136; optional ReLUParameter relu_param = 123; optional ReshapeParameter reshape_param = 133; - optional ROIPoolingParameter roi_pooling_param = 8266711; optional ScaleParameter scale_param = 142; - optional ProposalParameter proposal_param = 8266713; optional SigmoidParameter sigmoid_param = 124; optional SoftmaxParameter softmax_param = 125; optional SPPParameter spp_param = 132; @@ -407,8 +419,6 @@ message LayerParameter { optional ThresholdParameter threshold_param = 128; optional TileParameter tile_param = 138; optional WindowDataParameter window_data_param = 129; - - optional NNPACKConvolutionParameter nnpack_convolution_param = 204; } // Message that stores parameters used to apply transformation @@ -424,7 +434,7 @@ message TransformationParameter { optional uint32 crop_size = 3 [default = 0]; // mean_file and mean_value cannot be specified at the same time optional string mean_file = 4; - // if specified can be repeated once (would substract it from all the channels) + // if specified can be repeated once (would subtract it from all the channels) // or can be repeated the same number of times as channels // (would subtract them from the corresponding channel) repeated float mean_value = 5; @@ -440,7 +450,7 @@ message LossParameter { optional int32 ignore_label = 1; // How to normalize the loss for loss layers that aggregate across batches, // spatial dimensions, or other dimensions. Currently only implemented in - // SoftmaxWithLoss layer. 
+ // SoftmaxWithLoss and SigmoidCrossEntropyLoss layers. enum NormalizationMode { // Divide by the number of examples in the batch times spatial dimensions. // Outputs that receive the ignore label will NOT be ignored in computing @@ -454,6 +464,8 @@ message LossParameter { // Do not normalize the loss. NONE = 3; } + // For historical reasons, the default normalization for + // SigmoidCrossEntropyLoss is BATCH_SIZE and *not* VALID. optional NormalizationMode normalization = 3 [default = VALID]; // Deprecated. Ignored if normalization is specified. If normalization // is not specified, then setting this to false will be equivalent to @@ -504,11 +516,21 @@ message ConcatParameter { } message BatchNormParameter { - // If false, accumulate global mean/variance values via a moving average. If - // true, use those accumulated values instead of computing mean/variance - // across the batch. + // If false, normalization is performed over the current mini-batch + // and global statistics are accumulated (but not yet used) by a moving + // average. + // If true, those accumulated mean and variance values are used for the + // normalization. + // By default, it is set to false when the network is in the training + // phase and true when the network is in the testing phase. optional bool use_global_stats = 1; - // How much does the moving average decay each iteration? + // What fraction of the moving average remains each iteration? + // Smaller values make the moving average decay faster, giving more + // weight to the recent values. + // Each iteration updates the moving average @f$S_{t-1}@f$ with the + // current mean @f$ Y_t @f$ by + // @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$ + // is the moving_average_fraction parameter. optional float moving_average_fraction = 2 [default = .999]; // Small value to add to the variance estimate so that we don't divide by // zero. 
@@ -590,7 +612,6 @@ message ConvolutionParameter { DEFAULT = 0; CAFFE = 1; CUDNN = 2; - NNPACK = 3; } optional Engine engine = 15 [default = DEFAULT]; @@ -660,8 +681,8 @@ message DataParameter { optional bool mirror = 6 [default = false]; // Force the encoded image to have 3 color channels optional bool force_encoded_color = 9 [default = false]; - // Prefetch queue (Number of batches to prefetch to host memory, increase if - // data access bandwidth varies). + // Prefetch queue (Increase if data feeding bandwidth varies, within the + // limit of device memory for GPU training) optional uint32 prefetch = 10 [default = 4]; } @@ -808,6 +829,7 @@ message ImageDataParameter { message InfogainLossParameter { // Specify the infogain matrix source. optional string source = 1; + optional int32 axis = 2 [default = 1]; // axis of prob } message InnerProductParameter { @@ -825,13 +847,6 @@ message InnerProductParameter { // of the weight matrix. The weight matrix itself is not going to be transposed // but rather the transfer flag of operations will be toggled accordingly. 
optional bool transpose = 6 [default = false]; - - enum Engine { - DEFAULT = 0; - CAFFE = 1; - NNPACK = 2; - } - optional Engine engine = 7 [default = DEFAULT]; } message InputParameter { @@ -915,7 +930,6 @@ message PoolingParameter { DEFAULT = 0; CAFFE = 1; CUDNN = 2; - NNPACK = 3; } optional Engine engine = 11 [default = DEFAULT]; // If global_pooling then it will pool over the size of the bottom by doing @@ -930,17 +944,6 @@ message PowerParameter { optional float shift = 3 [default = 0.0]; } -message PSROIPoolingParameter { - required float spatial_scale = 1; - required int32 output_dim = 2; // output channel number - required int32 group_size = 3; // number of groups to encode position-sensitive score maps -} -message PSROIAlignParameter { - required float spatial_scale = 1; - required int32 output_dim = 2; // output channel number - required int32 group_size = 3; // number of groups to encode position-sensitive score maps -} - message PythonParameter { optional string module = 1; optional string layer = 2; @@ -949,9 +952,7 @@ message PythonParameter { // string, dictionary in Python dict format, JSON, etc. You may parse this // string in `setup` method and use it in `forward` and `backward`. optional string param_str = 3 [default = '']; - // Whether this PythonLayer is shared among worker solvers during data parallelism. - // If true, each worker solver sequentially run forward from this layer. - // This value should be set true if you are using it as a data layer. + // DEPRECATED optional bool share_in_parallel = 4 [default = false]; } @@ -1083,17 +1084,6 @@ message ReshapeParameter { optional int32 num_axes = 3 [default = -1]; } -// Message that stores parameters used by ROIPoolingLayer -message ROIPoolingParameter { - // Pad, kernel size, and stride are all given as a single value for equal - // dimensions in height and width or as Y, X pairs. 
- optional uint32 pooled_h = 1 [default = 0]; // The pooled output height - optional uint32 pooled_w = 2 [default = 0]; // The pooled output width - // Multiplicative spatial scale factor to translate ROI coords from their - // input scale to the scale used when pooling - optional float spatial_scale = 3 [default = 1]; -} - message ScaleParameter { // The first axis of bottom[0] (the first input Blob) along which to apply // bottom[1] (the second input Blob). May be negative to index from the end @@ -1131,13 +1121,6 @@ message ScaleParameter { optional FillerParameter bias_filler = 5; } -// Message that stores parameters used by ProposalLayer -message ProposalParameter { - optional uint32 feat_stride = 1 [default = 16]; - repeated uint32 scales = 2; - repeated float ratios = 3; -} - message SigmoidParameter { enum Engine { DEFAULT = 0; @@ -1438,22 +1421,6 @@ message PReLUParameter { // Initial value of a_i. Default is a_i=0.25 for all i. optional FillerParameter filler = 1; - // Whether or not slope paramters are shared across channels. + // Whether or not slope parameters are shared across channels. optional bool channel_shared = 2 [default = false]; } - -message NNPACKConvolutionParameter { - enum Algorithm { - AUTO = 0; - WINOGRAD = 1; - FFT_16x16 = 2; - FFT_8x8 = 3; - } - optional Algorithm algorithm = 1 [default=AUTO]; - enum KernelTransformStrategy { - RECOMPUTE = 0; - REUSE = 1; - } - optional KernelTransformStrategy kernel_transform_strategy = 2 [default=RECOMPUTE]; -} - diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto new file mode 100644 index 0000000000000000000000000000000000000000..c3744edafe7adda5acd6096aec50e70880bb2d44 --- /dev/null +++ b/mace/proto/mace.proto @@ -0,0 +1,137 @@ +syntax = "proto2"; + +package mace; + +enum NetMode { + INIT = 0; + NORMAL = 1; +} + +enum DeviceType { + CPU = 0; // In default, we will use CPU. 
+ NEON = 1; + OPENCL = 2; +} + +enum DataType { + DT_INVALID = 0; + + // Data types that all computation devices are expected to be + // capable to support. + DT_FLOAT = 1; + DT_DOUBLE = 2; + DT_INT32 = 3; + DT_UINT8 = 4; + DT_INT16 = 5; + DT_INT8 = 6; + DT_STRING = 7; + DT_INT64 = 8; + DT_UINT16 = 9; + DT_BOOL = 10; + DT_HALF = 19; + DT_UINT32 = 22; +} + +message TensorProto { + // The dimensions in the tensor. + repeated int64 dims = 1; + optional DataType data_type = 2 [default = DT_FLOAT]; + // For float + repeated float float_data = 3 [packed = true]; + // For int32, uint8, int8, uint16, int16, bool, and float16 + // Note about float16: in storage we will basically convert float16 byte-wise + // to unsigned short and then store them in the int32_data field. + repeated int32 int32_data = 4 [packed = true]; + // For bytes + optional bytes byte_data = 5; + // For strings + repeated bytes string_data = 6; + // For double + repeated double double_data = 9 [packed = true]; + // For int64 + repeated int64 int64_data = 10 [packed = true]; + // Optionally, a name for the tensor. 
+ optional string name = 7; + + optional uint32 node_id = 100; +} + +message Argument { + optional string name = 1; + optional float f = 2; + optional int64 i = 3; + optional bytes s = 4; + repeated float floats = 5; + repeated int64 ints = 6; + repeated bytes strings = 7; +} + +// for hexagon mace-nnlib +message NodeInput { + optional int32 node_id = 1; + optional int32 output_port = 2; +} + +message OutputShape { + repeated int64 dims = 1; +} + +message OperatorDef { + repeated string input = 1; + repeated string output = 2; + optional string name = 3; + optional string type = 4; + repeated Argument arg = 5; + repeated OutputShape output_shape = 6; + repeated DataType output_type = 7; + + repeated int32 mem_id = 10; + + // for hexagon mace-nnlib + optional uint32 node_id = 100; + optional uint32 op_id = 101; + optional uint32 padding = 102; + repeated NodeInput node_input = 103; + repeated int32 out_max_byte_size = 104; // only support 32-bit len +} + +// for memory optimization +message MemoryBlock { + optional int32 mem_id = 1; + optional uint32 x = 2; + optional uint32 y = 3; +} +message MemoryArena { + repeated MemoryBlock mem_block = 1; +} + +// for hexagon mace-nnlib +message InputInfo { + optional string name = 1; + optional int32 node_id = 2; + repeated int32 dims = 3; + optional int32 max_byte_size = 4; // only support 32-bit len + optional DataType data_type = 5 [default = DT_FLOAT]; +} +message OutputInfo { + optional string name = 1; + optional int32 node_id = 2; + repeated int32 dims = 3; + optional int32 max_byte_size = 4; // only support 32-bit len + optional DataType data_type = 5 [default = DT_FLOAT]; +} + +message NetDef { + optional string name = 1; + repeated OperatorDef op = 2; + optional string version = 3; + repeated Argument arg = 4; + repeated TensorProto tensors = 5; + + // for mem optimization + optional MemoryArena mem_arena = 10; + + // for hexagon mace-nnlib + repeated InputInfo input_info = 100; + repeated OutputInfo output_info = 
101; +} diff --git a/mace/python/tools/BUILD b/mace/python/tools/BUILD index 4b898b54411ea02f54e588b32a81e986610cee66..08e7bca4f72def244e292db3099526b9a06687de 100644 --- a/mace/python/tools/BUILD +++ b/mace/python/tools/BUILD @@ -1,8 +1,58 @@ -py_binary( - name = "caffe_ops_stats", - srcs = ["caffe_ops_stats.py"], +py_library( + name = "tf_converter_lib", + srcs = [ + "convert_util.py", + "graph_util.py", + "tf_converter_lib.py", + "tf_dsp_converter_lib.py", + ], srcs_version = "PY2AND3", deps = [ + ":memory_optimizer", + "//mace/proto:mace_py", + ], +) + +py_library( + name = "caffe_converter_lib", + srcs = [ + "caffe_converter_lib.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":memory_optimizer", "//mace/proto:caffe_py", ], ) + +py_library( + name = "source_converter_lib", + srcs = [ + "source_converter_lib.py", + ], + srcs_version = "PY2AND3", + deps = [ + "//mace/proto:mace_py", + ], +) + +py_binary( + name = "converter", + srcs = ["converter.py"], + srcs_version = "PY2AND3", + deps = [ + ":tf_converter_lib", + ":caffe_converter_lib", + ":source_converter_lib", + "@six_archive//:six", + ], +) + +py_binary( + name = "memory_optimizer", + srcs = ["memory_optimizer.py"], + srcs_version = "PY2AND3", + deps = [ + "//mace/proto:mace_py", + ], +) diff --git a/mace/python/tools/binary_codegen.py b/mace/python/tools/binary_codegen.py new file mode 100644 index 0000000000000000000000000000000000000000..aea06a0a1da060051cdf4b97ac93058e8241f3a5 --- /dev/null +++ b/mace/python/tools/binary_codegen.py @@ -0,0 +1,89 @@ +import argparse +import os +import sys +import struct + +import jinja2 + +import numpy as np + +# python mace/python/tools/binary_codegen.py \ +# --binary_dirs=${BIN_FILE} \ +# --binary_file_name=mace_run.config \ +# --output_path=${CODE_GEN_PATH} --variable_name=kTuningParamsData + +FLAGS = None + + +def generate_cpp_source(): + data_map = {} + for binary_dir in FLAGS.binary_dirs.split(","): + binary_path = os.path.join(binary_dir, 
FLAGS.binary_file_name) + if not os.path.exists(binary_path): + continue + + with open(binary_path, "rb") as f: + binary_array = np.fromfile(f, dtype=np.uint8) + + idx = 0 + size, = struct.unpack("Q", binary_array[idx:idx+8]) + print size + idx += 8 + for _ in xrange(size): + key_size, = struct.unpack("i", binary_array[idx:idx+4]) + idx += 4 + key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size]) + idx += key_size + params_size, = struct.unpack("i", binary_array[idx:idx+4]) + idx += 4 + data_map[key] = [] + count = params_size / 4 + params = struct.unpack(str(count) + "i", binary_array[idx:idx+params_size]) + for i in params: + data_map[key].append(i) + idx += params_size + + env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) + return env.get_template('str2vec_maps.cc.tmpl').render( + maps = data_map, + data_type = 'unsigned int', + variable_name = FLAGS.variable_name + ) + +def main(unused_args): + cpp_binary_source = generate_cpp_source() + if os.path.isfile(FLAGS.output_path): + os.remove(FLAGS.output_path) + w_file = open(FLAGS.output_path, "w") + w_file.write(cpp_binary_source) + w_file.close() + +def parse_args(): + """Parses command line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--binary_dirs", + type=str, + default="cl_bin0/,cl_bin1/", + help="The binaries file path.") + parser.add_argument( + "--binary_file_name", + type=str, + default="mace_run.config", + help="The binary file name.") + parser.add_argument( + "--output_path", + type=str, + default="", + help="The path of generated C++ source file which contains the binary.") + parser.add_argument( + "--variable_name", + type=str, + default="kTuningParamsData", + help="global variable name.") + return parser.parse_known_args() + + +if __name__ == '__main__': + FLAGS, unparsed = parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py new 
file mode 100644 index 0000000000000000000000000000000000000000..fbbe9f9f01f78059910cff7ea1cf6e8f490f604a --- /dev/null +++ b/mace/python/tools/caffe_converter_lib.py @@ -0,0 +1,752 @@ +from mace.proto import mace_pb2 +from mace.proto import caffe_pb2 +from mace.python.tools import memory_optimizer +import google.protobuf.text_format +import numpy as np +import math + +pooling_type_mode = { + 'AvgPool': 1, + 'MaxPool': 2 +} + +buffer_type_map = { + 'CONV2D_FILTER' : 0, + 'IN_OUT_CHANNEL' : 1, + 'ARGUMENT' : 2, + 'IN_OUT_HEIGHT' : 3, + 'IN_OUT_WIDTH' : 4, + 'WINOGRAD_FILTER' : 5, + 'DW_CONV2D_FILTER' : 6, + 'WEIGHT_HEIGHT' : 7, +} + +data_type_map = { + 'DT_HALF' : mace_pb2.DT_HALF, + 'DT_FLOAT': mace_pb2.DT_FLOAT +} + +activation_name_map = { + 'ReLU' : 'RELU', + 'Sigmoid' : 'SIGMOID', + 'TanH' : 'TANH', +} + +MACE_INPUT_NODE_NAME = "mace_input_node" +MACE_OUTPUT_NODE_NAME = "mace_output_node" + +OPENCL_IMAGE_MAX_SIZE = 16384 + +class Operator(object): + def __init__(self, name, type, layer): + self.name = name + self.type = type + self.layer = layer + self.parents = [] + self.children = [] + self.data = [] + self.output_shape_map = {} + + def add_parent(self, parent_op): + self.parents.append(parent_op) + parent_op.children.append(self) + + def get_single_parent(self): + if len(self.parents) != 1: + raise Exception('Operation %s expected single parent, but got %s' + % (self.name, len(self.parents))) + return self.parents[0] + +def BlobToNPArray(blob): + if blob.num != 0: + return (np.asarray(blob.data, dtype=np.float32). 
+ reshape((blob.num, blob.channels, blob.height, blob.width))) + else: + return np.asarray(blob.data, dtype=np.float32).reshape(blob.shape.dim) + + +class Shapes(object): + @staticmethod + def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func): + output_shape = np.zeros_like(input_shape) + output_shape[0] = input_shape[0] + output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0] + - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1 + output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1] + - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1 + output_shape[3] = filter_shape[2] + return output_shape + + @staticmethod + def fully_connected_shape(input_shape, weight_shape): + return [input_shape[0], 1, 1, weight_shape[0]] + + @staticmethod + def concat_shape(input_shapes, axis): + output_shape = None + for input_shape in input_shapes: + if output_shape is None: + output_shape = list(input_shape) + else: + output_shape[axis] += input_shape[axis] + return output_shape + + @staticmethod + def slice_shape(input_shape, num_output): + return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output] + +# outputs' name is [op.name + '_' + #] +class CaffeConverter(object): + def __init__(self, caffe_net, weights, net_def, dt, device, winograd): + self.net_def = net_def + self.caffe_net = caffe_net + self.weights = weights + self.dt = dt + self.device = device + self.winograd = winograd + self.resolved_ops = set() + self.ops = [] + self.inputs_map = {} # caffe op name -> mace inputs' name + + # Add Input operations + top_name_map = {} + inputs = caffe_net.input + for input in inputs: + self.ops.extend([Operator(input, 'Input', None)]) + top_name_map[input] = input + + layers = caffe_net.layer + # remove train layers and dropout + layers = self.remove_unused_layers(layers) + + # Construct graph + # Only support single-output layer + # 
layer with single output often use the same top name. + self.ops.extend([Operator(layer.name, layer.type, layer) for layer in layers]) + + self.ops_map = {op.name : op for op in self.ops} + output_op_map = {} + for layer in layers: + op = self.ops_map[layer.name] + for input_name in layer.bottom: + assert input_name != layer.name + parent_op = output_op_map.get(input_name) + if parent_op is None: + parent_op = self.ops_map[input_name] + op.add_parent(parent_op) + if op.name not in self.inputs_map: + self.inputs_map[op.name] = [] + self.inputs_map[op.name].extend([top_name_map[input_name]]) + for i in range(len(layer.top)): + output_name = layer.top[i] + if len(layer.top) == 1: + top_name_map[output_name] = op.name + else: + top_name_map[output_name] = op.name + '_' + str(i) + if output_name == layer.name: + continue + output_op_map[output_name] = op + + + # Load weights + weights_layers = weights.layer + for layer in weights_layers: + if not layer.blobs: + continue + if layer.name in self.ops_map: + op = self.ops_map[layer.name] + op.data = [BlobToNPArray(blob) for blob in layer.blobs] + + # toposort ops + self.ops = self.toposort_ops() + + def CommonConvert(self, op, mace_type): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + op_def.name = op.name + op_def.type = mace_type + op_def.input.extend([name+':0' for name in self.inputs_map[op.name]]) + return op_def + + def remove_unused_layers(self, layers): + phase_map = {0: 'train', 1: 'test'} + test_layers_names = set() + test_layers = [] + for layer in layers: + phase = 'test' + if len(layer.include): + phase = phase_map[layer.include[0].phase] + if len(layer.exclude): + phase = phase_map[layer.exclude[0].phase] + if phase == 'test' and layer.type != 'Dropout': + test_layers.append(layer) + assert layer.name not in test_layers_names + 
test_layers_names.add(layer.name) + return test_layers + + def toposort_ops(self): + sorted_ops = [] + temp_visited = set() + visited = set() + + def search(op): + if op.name in temp_visited: + raise Exception("The model is not DAG") + if op.name in visited: + return + temp_visited.add(op.name) + for parent_op in op.parents: + search(parent_op) + temp_visited.remove(op.name) + sorted_ops.append(op) + visited.add(op.name) + + for op in self.ops: + search(op) + + return sorted_ops + + def add_buffer_to_image(self, input_name, input_type): + output_name = input_name[:-2] + "_b2i" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'BufferToImage' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'mode' + arg.i = 0 + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_image_to_buffer(self, input_name, input_type): + output_name = input_name[:-2] + "_i2b" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_input_transform(self, names, is_single): + for name in names: + if is_single: + new_input_name = MACE_INPUT_NODE_NAME + ":0" + else: + new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'BufferToImage' + op_def.input.extend([new_input_name]) + op_def.output.extend([name+':0']) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + arg = op_def.arg.add() + arg.name = 'T' 
+ arg.i = self.dt + + def add_output_transform(self, names, is_single): + for name in names: + if is_single: + output_name = MACE_OUTPUT_NODE_NAME + ":0" + else: + output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([name+':0']) + op_def.output.extend([output_name]) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + def add_tensor(self, name, value): + tensor = self.net_def.tensors.add() + tensor.name = name + + shape = list(value.shape) + tensor.dims.extend(shape) + + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(value.flat) + + @staticmethod + def add_output_shape(op_def, output_shape): + mace_output_shape = mace_pb2.OutputShape() + mace_output_shape.dims.extend(output_shape) + op_def.output_shape.extend([mace_output_shape]) + + def add_stride_pad_kernel_arg(self, param, op_def): + try: + if len(param.stride) > 1 or len(param.kernel_size) > 1 or len(param.pad) > 1: + raise Exception('Mace does not support multiple stride/kernel_size/pad') + stride = [param.stride[0], param.stride[0]] if len(param.stride) else [1, 1] + pad = [param.pad[0] * 2, param.pad[0] * 2] if len(param.pad) else [0, 0] + kernel = [param.kernel_size[0], param.kernel_size[0]] if len(param.kernel_size) else [0, 0] + except TypeError: + stride = [param.stride, param.stride] + pad = [param.pad * 2, param.pad * 2] + kernel = [param.kernel_size, param.kernel_size] + + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + if param.HasField("stride_h") or param.HasField("stride_w"): + stride = [param.stride_h, param.stride_w] + strides_arg.ints.extend(stride) + # Pad + padding_arg = op_def.arg.add() + padding_arg.name = 'padding_values' + if param.HasField("pad_h") or param.HasField("pad_w"): + pad = [param.pad_h * 2, param.pad_w * 2] + padding_arg.ints.extend(pad) + # kernel + 
if op_def.type == 'Pooling': + kernel_arg = op_def.arg.add() + kernel_arg.name = 'kernels' + if param.HasField("kernel_h") or param.HasField("kernel_w"): + kernel = [param.kernel_h, param.kernel_w] + kernel_arg.ints.extend(kernel) + return pad, stride, kernel + + def convert_conv2d(self, op): + op_def = self.CommonConvert(op, 'Conv2D') + param = op.layer.convolution_param + + # Add filter + weight_tensor_name = op.name + '_weight:0' + weight_data = op.data[0].transpose((2, 3, 0, 1)) + self.add_tensor(weight_tensor_name, weight_data) + + if self.device == 'gpu': + buffer_type = "CONV2D_FILTER" + output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type) + op_def.input.extend([output_name]) + else: + op_def.input.extend([weight_tensor_name]) + + # Add Bias + if len(op.data) == 2: + bias_tensor_name = op.name + '_bias:0' + bias_data = op.data[1].reshape(-1) + self.add_tensor(bias_tensor_name, bias_data) + if self.device == 'gpu': + output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([bias_tensor_name]) + + paddings, strides, _ = self.add_stride_pad_kernel_arg(param, op_def) + dilations = [1, 1] + if len(param.dilation) > 0: + dilation_arg = op_def.arg.add() + dilation_arg.name = 'dilations' + if len(param.dilation) == 1: + dilations = [param.dilation[0], param.dilation[0]] + elif len(param.dilation) == 2: + dilations = [param.dilation[0], param.dilation[1]] + dilation_arg.ints.extend(dilations) + final_op = op + self.resolved_ops.add(op.name) + + output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]], + weight_data.shape, + paddings, strides, dilations, + math.floor) + op.output_shape_map[op.layer.top[0]] = output_shape + + if len(self.ops_map[final_op.name].children) == 1 \ + and self.ops_map[final_op.name].children[0].type in activation_name_map: + activation_op = self.ops_map[final_op.name].children[0] + op_def.type = 
"FusedConv2D" + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + final_op = activation_op + final_op.output_shape_map[final_op.layer.top[0]] = output_shape + self.resolved_ops.add(activation_op.name) + + op_def.output.extend([final_op.name+':0']) + self.add_output_shape(op_def, output_shape) + self.net_def.op.extend([op_def]) + + def convert_batchnorm(self, op): + if len(op.children) != 1 or op.children[0].type != 'Scale': + raise Exception('Now only support BatchNorm+Scale') + op_def = self.CommonConvert(op, 'FoldedBatchNorm') + scale_op = op.children[0] + + epsilon_value = op.layer.batch_norm_param.eps + if op.data[2][0] != 0: + mean_value = (1. / op.data[2][0]) * op.data[0] + var_value = (1. / op.data[2][0]) * op.data[1] + else: + raise RuntimeError('scalar is zero.') + + gamma_value = scale_op.data[0] + beta_value = np.zeros_like(mean_value) + if len(scale_op.data) == 2: + beta_value = scale_op.data[1] + + scale_value = ( + (1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) * + gamma_value).reshape(-1) + offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1) + input_names = [op.name+'_scale:0', op.name+'_offset:0'] + self.add_tensor(input_names[0], scale_value) + self.add_tensor(input_names[1], offset_value) + + if self.device == 'gpu': + for name in input_names: + output_name = self.add_buffer_to_image(name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([name for name in input_names]) + + self.resolved_ops.add(op.name) + self.resolved_ops.add(scale_op.name) + final_op = scale_op + + output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]] + + if len(self.ops_map[final_op.name].children) == 1 \ + and self.ops_map[final_op.name].children[0].type in activation_name_map: + activation_op = self.ops_map[final_op.name].children[0] + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + 
fused_act_arg.s = activation_name_map[activation_op.type]
+      final_op = activation_op
+      final_op.output_shape_map[final_op.layer.top[0]] = output_shape
+      self.resolved_ops.add(activation_op.name)
+
+    op_def.output.extend([final_op.name + ':0'])
+    self.add_output_shape(op_def, output_shape)
+    self.net_def.op.extend([op_def])
+
+  def convert_inner_product(self, op):
+    param = op.layer.inner_product_param
+    try:
+      if param.axis != 1 or param.transpose:
+        raise ValueError('Do not support non-default axis and transpose '
+                         'case for inner product')
+    except AttributeError:
+      pass
+
+    op_def = self.CommonConvert(op, 'FC')
+    weight_tensor_name = op.name + '_weight:0'
+    if op.data[0].ndim not in [2, 4]:
+      raise ValueError('Unexpected weight ndim.')
+    if op.data[0].ndim == 4 and list(op.data[0].shape[:2]) != [1, 1]:
+      raise ValueError('Only support 4D weight with shape [1, 1, *, *]')
+    input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
+
+    weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
+    assert weight_data.shape[1] == (input_shape[1] * input_shape[2] * input_shape[3])
+    weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
+    weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
+    self.add_tensor(weight_tensor_name, weight_data)
+    if self.device == 'gpu':
+      if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \
+          or weight_data.shape[1] > OPENCL_IMAGE_MAX_SIZE:
+        raise Exception('Mace gpu do not support FC with weight shape: '
+                        +str(weight_data.shape))
+      buffer_type = "WEIGHT_HEIGHT"
+      output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
+      op_def.input.extend([output_name])
+    else:
+      op_def.input.extend([weight_tensor_name])
+
+    # Add Bias
+    if len(op.data) == 2:
+      bias_tensor_name = op.name + '_bias:0'
+      bias_data = op.data[1].reshape(-1)
+      self.add_tensor(bias_tensor_name, bias_data)
+      if self.device == 'gpu':
+        output_name = 
self.add_buffer_to_image(bias_tensor_name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([bias_tensor_name]) + + self.resolved_ops.add(op.name) + output_shape = Shapes.fully_connected_shape(input_shape, weight_data.shape) + op.output_shape_map[op.layer.top[0]] = output_shape + final_op = op + + if len(self.ops_map[final_op.name].children) == 1 \ + and self.ops_map[final_op.name].children[0].type in activation_name_map: + activation_op = self.ops_map[final_op.name].children[0] + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + final_op = activation_op + final_op.output_shape_map[final_op.layer.top[0]] = output_shape + self.resolved_ops.add(activation_op.name) + + op_def.output.extend([final_op.name + ':0']) + self.add_output_shape(op_def, output_shape) + self.net_def.op.extend([op_def]) + + def convert_pooling(self, op): + op_def = self.CommonConvert(op, 'Pooling') + + param = op.layer.pooling_param + paddings, strides, kernels = self.add_stride_pad_kernel_arg(param, op_def) + if param.pool == caffe_pb2.PoolingParameter.MAX: + pooling_type = "MaxPool" + elif param.pool == caffe_pb2.PoolingParameter.AVE: + pooling_type = "AvgPool" + pooling_type_arg = op_def.arg.add() + pooling_type_arg.name = 'pooling_type' + pooling_type_arg.i = pooling_type_mode[pooling_type] + + input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]] + filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] + output_shape = Shapes.conv_pool_shape(input_shape, filter_shape, + paddings, strides, [1, 1], math.ceil) + op.output_shape_map[op.layer.top[0]] = output_shape + + op_def.output.extend([op.name + ':0']) + self.add_output_shape(op_def, output_shape) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def convert_activation(self, op): + op_def = self.CommonConvert(op, 'Activation') + activation_arg = op_def.arg.add() + 
activation_arg.name = 'activation'
+    activation_arg.s = activation_name_map[op.type]
+    op_def.output.extend([op.name + ':0'])
+    output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
+    op.output_shape_map[op.layer.top[0]] = output_shape
+    self.add_output_shape(op_def, output_shape)
+    self.net_def.op.extend([op_def])
+    self.resolved_ops.add(op.name)
+
+  def convert_prelu(self, op):
+    op_def = self.CommonConvert(op, 'Activation')
+    activation_arg = op_def.arg.add()
+    activation_arg.name = 'activation'
+    activation_arg.s = 'PRELU'
+    alpha_tensor_name = op.name + '_alpha:0'
+    alpha_data = op.data[0].reshape(-1)
+    self.add_tensor(alpha_tensor_name, alpha_data)
+    if self.device == 'gpu':
+      output_name = self.add_buffer_to_image(alpha_tensor_name, "ARGUMENT")
+      op_def.input.extend([output_name])
+    else:
+      op_def.input.extend([alpha_tensor_name])
+    op_def.output.extend([op.name + ':0'])
+    output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
+    op.output_shape_map[op.layer.top[0]] = output_shape
+    self.add_output_shape(op_def, output_shape)
+    self.net_def.op.extend([op_def])
+    self.resolved_ops.add(op.name)
+
+  def convert_add(self, op):
+    op_def = self.CommonConvert(op, 'AddN')
+    op_def.output.extend([op.name + ':0'])
+    output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
+    op.output_shape_map[op.layer.top[0]] = output_shape
+    self.add_output_shape(op_def, output_shape)
+    self.net_def.op.extend([op_def])
+    self.resolved_ops.add(op.name)
+
+  def convert_concat(self, op):
+    op_def = self.CommonConvert(op, 'Concat')
+    axis_arg = op_def.arg.add()
+    axis_arg.name = 'axis'
+    axis_arg.i = 3
+    try:
+      if op.layer.concat_param.HasField('axis'):
+        axis_arg.i = op.layer.concat_param.axis
+      elif op.layer.concat_param.HasField('concat_dim'):
+        axis_arg.i = op.layer.concat_param.concat_dim
+    except AttributeError:
+      pass
+
+    input_shapes = []
+    for i in range(len(op.parents)):
+ 
input_shapes.append(op.parents[i].output_shape_map[op.layer.bottom[i]]) + output_shape = Shapes.concat_shape(input_shapes, axis_arg.i) + op.output_shape_map[op.layer.top[0]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def convert_eltwise(self, op): + op_def = self.CommonConvert(op, 'Eltwise') + param = op.layer.eltwise_param + type_arg = op_def.arg.add() + type_arg.name = 'type' + type_arg.i = param.operation + if len(param.coeff) > 0: + coeff_arg = op_def.arg.add() + coeff_arg.name = 'coeff' + coeff_arg.ints.extend(list(param.coeff)) + + output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] + op.output_shape_map[op.layer.top[0]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def convert_slice(self, op): + op_def = self.CommonConvert(op, 'Slice') + if op.layer.HasField('slice_param'): + param = op.layer.slice_param + if param.HasField('axis') and param.axis != 1: + raise Exception('Mace do not support slice with axis ' + str(param.axis)) + if len(param.slice_point) > 0: + raise Exception('Mace do not support slice with slice_point') + + input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] + num_outputs = len(op.layer.top) + if (input_shape[3] % num_outputs) != 0 or \ + (self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) : + raise Exception('Mace do not support slice with input shape ' + + str(input_shape) + ' and number of output ' + str(num_outputs)) + output_shape = Shapes.slice_shape(input_shape, num_outputs) + for i in range(len(op.layer.top)): + op.output_shape_map[op.layer.top[i]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + '_' + str(i) + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + 
def convert_normal_op(self, op): + op_def = self.CommonConvert(op, op.type) + output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] + op.output_shape_map[op.layer.top[0]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def replace_in_out_name(self, input_names, output_names, is_single): + in_names = set([input_name + ":0" for input_name in input_names]) + out_names = set([output_name + ":0" for output_name in output_names]) + if is_single: + for op in self.net_def.op: + if len(op.input) > 0 and op.input[0] in in_names: + op.input[0] = MACE_INPUT_NODE_NAME + ':0' + if len(op.output) > 0 and op.output[0] in out_names: + op.output[0] = MACE_OUTPUT_NODE_NAME + ':0' + else: + for op in self.net_def.op: + if len(op.input) > 0 and op.input[0] in in_names: + op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0] + if len(op.output) > 0 and op.output[0] in out_names: + op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0] + + def add_input_op_shape(self, input_nodes, input_shapes): + assert len(input_nodes) == len(input_shapes) + for i in range(len(input_nodes)): + input_op = self.ops_map[input_nodes[i]] + if input_op.layer is not None: + input_op.output_shape_map[input_op.layer.top[0]] = input_shapes[i] + else: + input_op.output_shape_map[input_op.name] = input_shapes[i] + + def convert(self, input_nodes, input_shapes, output_nodes): + is_single = len(input_nodes) == 1 and len(output_nodes) == 1 + if self.device == 'gpu': + self.add_input_transform(input_nodes, is_single) + + assert self.ops[0].type == 'Input' + self.add_input_op_shape(input_nodes, input_shapes) + + for op in self.ops: + if op.name in self.resolved_ops: + continue + if op.type == 'Input': + self.resolved_ops.add(op.name) + elif op.type == 'Convolution': + self.convert_conv2d(op) + elif op.type == 'BatchNorm': + self.convert_batchnorm(op) + elif op.type == 
'InnerProduct': + self.convert_inner_product(op) + elif op.type == 'Pooling': + self.convert_pooling(op) + elif op.type == 'PReLU': + self.convert_prelu(op) + elif op.type in ['ReLU', 'Sigmoid', 'TanH']: + self.convert_activation(op) + elif op.type == 'Add': + self.convert_add(op) + elif op.type == 'Concat': + self.convert_concat(op) + elif op.type == 'Eltwise': + self.convert_eltwise(op) + elif op.type in ['Softmax']: + self.convert_normal_op(op) + elif op.type == 'Slice': + self.convert_slice(op) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) + + if self.device == 'gpu': + self.add_output_transform(output_nodes, is_single) + + if self.device == 'cpu': + self.replace_in_out_name(input_nodes, output_nodes, is_single) + + for op in self.ops: + if op.name not in self.resolved_ops: + print 'Unresolve Op: %s with type %s' % (op.name, op.type) + + +def convert_to_mace_pb(model_file, weight_file, input_node_str, input_shape_str, output_node_str, data_type, device, winograd): + net_def = mace_pb2.NetDef() + dt = data_type_map[data_type] + + caffe_net = caffe_pb2.NetParameter() + with open(model_file, "r") as f: + google.protobuf.text_format.Merge(str(f.read()), caffe_net) + + weights = caffe_pb2.NetParameter() + with open(weight_file, "rb") as f: + weights.MergeFromString(f.read()) + + input_nodes = [x for x in input_node_str.split(',')] + input_shapes = [] + if input_shape_str != "": + input_shape_strs = [x for x in input_shape_str.split(':')] + for shape_str in input_shape_strs: + input_shapes.extend([[int(x) for x in shape_str.split(',')]]) + output_nodes = [x for x in output_node_str.split(',')] + assert len(input_nodes) == len(input_shapes) + + converter = CaffeConverter(caffe_net, weights, net_def, dt, device, winograd) + converter.convert(input_nodes, input_shapes, output_nodes) + print "PB Converted." + if device == 'gpu': + print "start optimize memory." 
+ mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) + mem_optimizer.optimize() + print "Memory optimization done." + + return net_def diff --git a/mace/python/tools/caffe_ops_stats.py b/mace/python/tools/caffe_ops_stats.py deleted file mode 100644 index 4eba5b664de816722d370c61757117ef0ffd25fe..0000000000000000000000000000000000000000 --- a/mace/python/tools/caffe_ops_stats.py +++ /dev/null @@ -1,42 +0,0 @@ -from mace.proto import caffe_pb2 -import google.protobuf.text_format -import operator -import functools -import argparse -import sys -import six -import os.path - -FLAGS = None - -def main(unused_args): - if not os.path.isfile(FLAGS.input): - print 'input model file not exist' - return -1 - net = caffe_pb2.NetParameter() - with open(FLAGS.input) as f: - google.protobuf.text_format.Merge(str(f.read()), net) - - ops = {} - for layer in net.layer: - if layer.type not in ops: - ops[layer.type] = 1 - else: - ops[layer.type] += 1 - - for key, value in sorted(ops.items(), key=operator.itemgetter(1)): - print key, ":", value - -def parse_args(): - '''Parses command line arguments.''' - parser = argparse.ArgumentParser() - parser.add_argument( - '--input', - type=str, - default='', - help='Caffe \'GraphDef\' file to load.') - return parser.parse_known_args() - -if __name__ == '__main__': - FLAGS, unparsed = parse_args() - main(unused_args=[sys.argv[0]] + unparsed) diff --git a/mace/python/tools/convert_util.py b/mace/python/tools/convert_util.py new file mode 100644 index 0000000000000000000000000000000000000000..53b3196952e991e6163b83ff3ca14395bcba9856 --- /dev/null +++ b/mace/python/tools/convert_util.py @@ -0,0 +1,29 @@ +import tensorflow as tf +from mace.proto import mace_pb2 + +TF_DTYPE_2_MACE_DTYPE_MAP = { + tf.float32: mace_pb2.DT_FLOAT, + tf.double: mace_pb2.DT_DOUBLE, + tf.half: mace_pb2.DT_HALF, + tf.int64: mace_pb2.DT_INT64, + tf.int32: mace_pb2.DT_INT32, + tf.qint32: mace_pb2.DT_INT32, + tf.int16: mace_pb2.DT_INT16, + tf.qint16: mace_pb2.DT_INT16, + 
# --- residue: tail of mace/python/tools/convert_util.py (continuation of the
# TF dtype map and tf_dtype_2_mace_dtype); not reproducible standalone here
# without its head, which lies in the previous span. ---

# mace/python/tools/converter.py
import argparse
import sys
import hashlib
import os.path

# ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb --output quantized_test_dsp.pb --runtime dsp --input_dim input_node,1,28,28,3

FLAGS = None


def file_checksum(fname):
    """Return the SHA-256 hex digest of the file at `fname`, read in 4 KiB chunks."""
    hash_func = hashlib.sha256()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_func.update(chunk)
    return hash_func.hexdigest()


def main(unused_args):
    """Validate inputs, dispatch to the platform-specific converter and emit output.

    Exits with -1 on missing files, checksum mismatches or unsupported
    platform/runtime combinations.
    """
    if not os.path.isfile(FLAGS.model_file):
        print("Input graph file '" + FLAGS.model_file + "' does not exist!")
        sys.exit(-1)

    model_checksum = file_checksum(FLAGS.model_file)
    if FLAGS.model_checksum != "" and FLAGS.model_checksum != model_checksum:
        print("Model checksum mismatch: %s != %s" % (model_checksum, FLAGS.model_checksum))
        sys.exit(-1)

    if FLAGS.platform == 'caffe':
        if not os.path.isfile(FLAGS.weight_file):
            print("Input weight file '" + FLAGS.weight_file + "' does not exist!")
            sys.exit(-1)

        weight_checksum = file_checksum(FLAGS.weight_file)
        if FLAGS.weight_checksum != "" and FLAGS.weight_checksum != weight_checksum:
            print("Weight checksum mismatch: %s != %s" % (weight_checksum, FLAGS.weight_checksum))
            sys.exit(-1)

        if FLAGS.runtime == 'dsp':
            print("DSP not support caffe model yet.")
            sys.exit(-1)

        from mace.python.tools import caffe_converter_lib
        output_graph_def = caffe_converter_lib.convert_to_mace_pb(
            FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node,
            FLAGS.data_type, FLAGS.runtime, FLAGS.winograd)
    elif FLAGS.platform == 'tensorflow':
        if FLAGS.runtime == 'dsp':
            from mace.python.tools import tf_dsp_converter_lib
            output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
                FLAGS.model_file, FLAGS.input_node, FLAGS.output_node, FLAGS.dsp_mode)
        else:
            from mace.python.tools import tf_converter_lib
            output_graph_def = tf_converter_lib.convert_to_mace_pb(
                FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node,
                FLAGS.data_type, FLAGS.runtime, FLAGS.winograd)
    else:
        # fixed: an unknown platform previously fell through and crashed with
        # NameError on output_graph_def below
        print("Unsupported platform: %s" % FLAGS.platform)
        sys.exit(-1)

    if FLAGS.output_type == 'source':
        # lazy import, consistent with the converter-lib imports above
        from mace.python.tools import source_converter_lib
        source_converter_lib.convert_to_source(output_graph_def, model_checksum, FLAGS.template, FLAGS.obfuscate,
                                               FLAGS.model_tag, FLAGS.output, FLAGS.runtime, FLAGS.embed_model_data)
    else:
        with open(FLAGS.output, "wb") as f:
            f.write(output_graph_def.SerializeToString())
        with open(FLAGS.output + '_txt', "wb") as f:
            # output_graph_def.ClearField('tensors')
            f.write(str(output_graph_def))
    # fixed: Python-2-only print statement
    print("Model conversion is completed.")


def str2bool(v):
    """argparse type: accept common yes/no spellings (case-insensitive)."""
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def parse_args():
    """Parses command line arguments."""
    parser = argparse.ArgumentParser()
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument(
        "--model_file",
        type=str,
        default="",
        help="TensorFlow \'GraphDef\' file to load, Caffe prototxt file to load.")
    parser.add_argument(
        "--weight_file",
        type=str,
        default="",
        help="Caffe data file to load.")
    parser.add_argument(
        "--model_checksum",
        type=str,
        default="",
        help="Model file sha256 checksum")
    parser.add_argument(
        "--weight_checksum",
        type=str,
        default="",
        help="Weight file sha256 checksum")
    parser.add_argument(
        "--output",
        type=str,
        default="",
        help="File to save the output graph to.")
    parser.add_argument(
        "--runtime",
        type=str,
        default="cpu",
        help="Runtime: cpu/gpu/dsp")
    parser.add_argument(
        "--input_node",
        type=str,
        default="input_node",
        help="e.g., input_node")
    parser.add_argument(
        "--output_node",
        type=str,
        default="softmax",
        help="e.g., softmax")
    parser.add_argument(
        "--data_type",
        type=str,
        default='DT_FLOAT',
        help="e.g., DT_HALF/DT_FLOAT")
    parser.add_argument(
        "--output_type",
        type=str,
        default="pb",
        help="output type: source/pb")
    parser.add_argument(
        "--template",
        type=str,
        default="",
        help="template path")
    parser.add_argument(
        "--obfuscate",
        type=str2bool,
        nargs='?',
        # NOTE(review): with nargs='?' a bare `--obfuscate` yields const=False,
        # i.e. it does NOT enable obfuscation — confirm this is intended.
        const=False,
        default=False,
        help="obfuscate model names")
    parser.add_argument(
        "--model_tag",
        type=str,
        default="",
        help="model tag for generated function and namespace")
    parser.add_argument(
        "--winograd",
        type=str2bool,
        nargs='?',
        const=False,
        default=False,
        help="open winograd convolution or not")
    parser.add_argument(
        "--dsp_mode",
        type=int,
        default=0,
        help="dsp run mode, defalut=0")
    parser.add_argument(
        "--input_shape",
        type=str,
        default="",
        help="input shape.")
    parser.add_argument(
        "--platform",
        type=str,
        default="tensorflow",
        help="tensorflow/caffe")
    parser.add_argument(
        "--embed_model_data",
        type=str2bool,
        default=True,
        # fixed: help text was copy-pasted from --input_shape
        help="embed model data into the generated source.")
    return parser.parse_known_args()


if __name__ == '__main__':
    FLAGS, unparsed = parse_args()
    main(unused_args=[sys.argv[0]] + unparsed)
# --- residue: diff header for mace/python/tools/dsp_ops.py (new file). ---


class DspOps(object):
    """Lookup table mapping TensorFlow op type names to Hexagon DSP nn-lib op names."""

    def __init__(self):
        # tf op type -> DSP nn op name.
        self.dsp_ops = {
            # fixed: original value was 'INPUT"' with a stray trailing double quote
            'INPUT': 'INPUT',
            'OUTPUT': 'OUTPUT',
            'NoOp': 'Nop',
            'FLATTEN': 'Flatten',
            'Identity': 'Nop',
            'Placeholder': 'INPUT',
            'Const': 'Const',
            'QuantizedConv2D': 'QuantizedConv2d_8x8to32',
            'QuantizedMatMul': 'QuantizedMatMul_8x8to32',
            'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8',
            'QuantizedRelu': 'QuantizedRelu_8',
            'QuantizedReluX': 'QuantizedReluX_8',
            'QuantizedMaxPool': 'QuantizedMaxPool_8',
            'QuantizedAvgPool': 'QuantizedAvgPool_8',
            'QuantizedConcat': 'QuantizedConcat_8',
            'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32',
            'QuantizedResizeBilinear': 'QuantizedResizeBilinear_8',
            'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8',
            'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8',
            'QuantizedSoftmax': 'QuantizedSoftmax_8',
            'Min': 'Min_f',
            'Max': 'Max_f',
            'QuantizeV2': 'Quantize',
            'Dequantize': 'Dequantize',
            'Softmax': 'Softmax_f',
            'Reshape': 'Reshape',
            'QuantizedReshape': 'QuantizedReshape',
            'Sigmoid': 'Sigmoid_f',
            'Slice': 'Slice_f',
            'Add': 'Add_f',
            'Mul': 'Mul_f',
            'Requantize': 'Requantize_32to8',
            'RequantizationRange': 'RequantizationRange_32',
            'Sub': 'Sub_f',
            'Pack': 'Pack_int32',
            'StridedSlice': 'StridedSlice_f',
            'ExpandDims': 'ExpandDims_f',
            'QuantizedMul': 'QuantizedMul_8x8to32',
            'QuantizedAdd': 'QuantizedAdd_8p8to32',
            'Pad': 'Pad_f',
            'SpaceToBatchND': 'SpaceToBatchND_f',
            'BatchToSpaceND': 'BatchToSpaceND_f',
            'ResizeBilinear': 'ResizeBilinear_f',
            'ConcatV2': 'ConcatV2_f',
            'Conv2DBackpropInput': 'Deconv_f',
            'Tanh': 'Tanh_f',
            'Split': 'Split_f',
            'Transpose': 'Transpose_f',
            'Concat': 'Concat_f',
            'AddN': 'AddN_f',
        }

    def has_op(self, tf_op):
        """Return True when `tf_op` has a DSP counterpart."""
        return tf_op in self.dsp_ops

    def map_nn_op(self, tf_op):
        """Return the DSP nn op name for `tf_op`; raise for unsupported ops."""
        if tf_op not in self.dsp_ops:
            # fixed: original passed two args to Exception, producing a tuple message
            raise Exception('Could not map nn op for: %s' % tf_op)
        return self.dsp_ops[tf_op]
# --- residue: tail of DspOps.map_nn_op (raise/return) — the method begins in
# the previous span and is left to that span. ---

# mace/python/tools/graph_util.py
from collections import OrderedDict


def _topo_sort_node(node, nodes_map, ordered_nodes_map):
    """Insert `node` into `ordered_nodes_map` after all of its producers (DFS).

    Input names of the form 'name:port' are reduced to 'name'; inputs that are
    not in `nodes_map` are skipped. NOTE(review): recursion depth equals the
    longest producer chain — very deep graphs could hit the recursion limit.
    """
    if node.name in ordered_nodes_map:
        return
    for input_tensor_name in node.input:
        input_node_name = input_tensor_name.split(':')[
            0] if ':' in input_tensor_name else input_tensor_name
        if input_node_name not in nodes_map or input_node_name in ordered_nodes_map:
            continue
        _topo_sort_node(nodes_map[input_node_name], nodes_map, ordered_nodes_map)
    ordered_nodes_map[node.name] = node


# fixed: sort_tf_node and sort_mace_node were byte-identical duplicate
# implementations; both now delegate to the shared helper above.
def sort_tf_node(node, nodes_map, ordered_nodes_map):
    """Topologically order a TensorFlow NodeDef and its producers."""
    _topo_sort_node(node, nodes_map, ordered_nodes_map)


def sort_tf_graph(graph_def):
    """Return a tf.GraphDef whose nodes are topologically ordered."""
    import tensorflow as tf  # lazy: only needed when actually sorting a TF graph

    nodes_map = {}
    ordered_nodes_map = OrderedDict()
    for node in graph_def.node:
        nodes_map[node.name] = node
    for node in graph_def.node:
        sort_tf_node(node, nodes_map, ordered_nodes_map)
    sorted_graph = tf.GraphDef()
    sorted_graph.node.extend([node for node in ordered_nodes_map.values()])
    return sorted_graph


def sort_mace_node(node, nodes_map, ordered_nodes_map):
    """Topologically order a mace OperatorDef and its producers."""
    _topo_sort_node(node, nodes_map, ordered_nodes_map)


def sort_mace_graph(graph_def, output_name):
    """Topologically order the ops of a mace NetDef reachable from `output_name`."""
    nodes_map = {}
    ordered_nodes_map = OrderedDict()
    for node in graph_def.op:
        nodes_map[node.name] = node
    sort_mace_node(nodes_map[output_name], nodes_map, ordered_nodes_map)
    # NOTE: the NetDef assembly tail of this function lies in the next span.
# --- residue: tail of graph_util.sort_mace_graph (NetDef assembly via
# mace_pb2) — the function begins in the previous span; left to that span. ---

# mace/python/tools/memory_optimizer.py
import sys
import operator
# fixed: `reduce` is not a builtin on Python 3; functools.reduce works on 2 and 3.
# (the unused `from mace.proto import mace_pb2` import was dropped)
from functools import reduce


class MemoryOptimizer(object):
    """Greedily reuse output buffers across ops of a mace NetDef.

    Builds a reference count per output tensor, then walks the ops in order,
    handing each output either a freed (idle) memory block or a fresh one and
    growing each block to the maximum 2-D size ever requested for it.
    BufferToImage/ImageToBuffer ops are excluded from planning.
    """

    def __init__(self, net_def):
        self.net_def = net_def
        self.idle_mem = set()          # mem ids currently free for reuse
        self.op_mem = {}               # op output tensor name -> mem_id
        self.mem_block = {}            # mem_id -> [x, y] block extents
        self.total_mem_count = 0
        self.ref_counter = {}          # tensor name -> remaining consumer count

        consumers = {}
        for op in net_def.op:
            if self.is_buffer_image_op(op):
                continue
            for ipt in op.input:
                if ipt not in consumers:
                    consumers[ipt] = []
                consumers[ipt].append(op)
        # only ref op's output tensor
        for op in net_def.op:
            if self.is_buffer_image_op(op):
                continue
            for output in op.output:
                tensor_name = output
                if tensor_name in consumers:
                    self.ref_counter[tensor_name] = len(consumers[tensor_name])
                else:
                    self.ref_counter[tensor_name] = 0

    def is_buffer_image_op(self, op):
        """True for the buffer<->image transform ops that are not planned."""
        return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'

    def get_mem_size(self, op_type, output_shape):
        """Return the [x, y] block extents needed for one output of `op_type`.

        WinogradTransform/GEMM outputs are laid out differently from the
        default (channel-packed-by-4) layout.
        """
        mem_size = [0, 0]
        if op_type == 'WinogradTransform' or op_type == 'GEMM':
            mem_size[0] = output_shape[2] * output_shape[3]
            mem_size[1] = output_shape[0] * ((output_shape[1] + 3) // 4)
        else:
            mem_size[0] = output_shape[2] * ((output_shape[3] + 3) // 4)
            mem_size[1] = output_shape[0] * output_shape[1]
        return mem_size

    def optimize(self):
        """Assign mem ids to every plannable op output and emit the mem arena."""
        for op in self.net_def.op:
            if self.is_buffer_image_op(op):
                continue
            if not op.output_shape:
                print('WARNING: There is no output shape information to do memory optimization.')
                return
            if len(op.output_shape) != len(op.output):
                print('WARNING: the number of output shape is not equal to the number of output.')
                return
            for i in range(len(op.output)):
                if len(self.idle_mem) == 0:
                    # allocate new mem
                    mem_id = self.total_mem_count
                    self.total_mem_count += 1
                else:
                    # reuse mem
                    mem_id = self.idle_mem.pop()

                op.mem_id.extend([mem_id])
                self.op_mem[op.output[i]] = mem_id
                if mem_id not in self.mem_block:
                    self.mem_block[mem_id] = [0, 0]
                mem_size = self.mem_block[mem_id]
                op_mem_size = self.get_mem_size(op.type, op.output_shape[i].dims)
                mem_size[0] = max(mem_size[0], op_mem_size[0])
                mem_size[1] = max(mem_size[1], op_mem_size[1])

            # de-ref input tensor mem
            for ipt in op.input:
                if ipt in self.ref_counter:
                    self.ref_counter[ipt] -= 1
                    if self.ref_counter[ipt] == 0:
                        self.idle_mem.add(self.op_mem[ipt])
                    elif self.ref_counter[ipt] < 0:
                        raise Exception('ref count is less than 0')

        for mem in self.mem_block:
            arena = self.net_def.mem_arena
            block = arena.mem_block.add()
            block.mem_id = mem
            block.x = self.mem_block[mem][0]
            block.y = self.mem_block[mem][1]

        # fixed: these prints passed the values as extra print arguments
        # (printing a tuple) instead of %-formatting them
        print('total op: %d' % len(self.net_def.op))
        origin_mem_size = 0
        optimized_mem_size = 0
        for op in self.net_def.op:
            if self.is_buffer_image_op(op):
                continue
            origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
        for mem in self.mem_block:
            # fixed: Python-2-only print statement
            print(mem, self.mem_block[mem])
            optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)

        print('origin mem: %d, optimized mem: %d' % (origin_mem_size, optimized_mem_size))


def optimize_memory(net_def):
    """Convenience wrapper: run MemoryOptimizer on `net_def` in place."""
    mem_optimizer = MemoryOptimizer(net_def)
    mem_optimizer.optimize()

# --- residue: diff header and head of mace/python/tools/model.template
# (generated C++/jinja); that template is handled in later spans. ---
+// + +#include +#include + +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace {{tag}} { + +{% for tensor in tensors %} +extern void CreateTensor{{ tensor.id }}(std::vector &tensors, + const unsigned char *model_data); +{% endfor %} + + +{% for i in range(net.op|length) %} +extern void CreateOperator{{i}}(mace::OperatorDef &op); +{% endfor %} + +} // namespace {{ tag }} + +namespace { + +{% if net.arg|length != 0 %} +void CreateNetArg(mace::NetDef &net_def) { + net_def.mutable_arg().reserve({{ net.arg|length }}); + mace::Argument *arg = nullptr; + {% for arg in net.arg %} + + arg = net_def.add_arg(); + arg->set_name({{ arg.name|tojson }}); + + {%- if arg.HasField('f') %} + arg->set_f({{ arg.f }}); + {% endif %} + + {%- if arg.HasField('i') %} + arg->set_i({{ arg.i }}); + {% endif %} + + {%- if arg.HasField('s') %} + arg->set_s({{ arg.s|tojson }}); + {% endif %} + + {% if arg.floats|length != 0 %} + arg->set_floats({ {{ arg.floats|join(', ') }} }); + {% endif %} + {% if arg.ints|length != 0 %} + arg->set_ints({ {{ arg.ints|join(', ') }} }); + {% endif %} + {% if arg.strings|length != 0 %} + arg->set_strings({ {{ arg.strings|stringfy() }} }); + {% endif %} + + {% endfor %} +} +{% endif %} + +{% if net.output_info | length > 0 %} +void CreateOutputInfo(mace::NetDef &net_def) { + std::vector> dims { {{net.output_info | map(attribute='dims') | join(', ') | replace('[', '{') | replace(']', '}') }} }; + + std::vector data_types_int { {{ net.output_info | map(attribute='data_type') | join(', ') }} }; + std::vector data_types({{ net.output_info | length }}); + for (int k = 0; k < {{ net.output_info | length }}; ++k) { + data_types[k] = static_cast(data_types_int[k]); + } + net_def.mutable_output_info().resize({{ net.output_info | length }}); + for (int i = 0; i < {{ net.output_info | length }}; ++i) { + net_def.mutable_output_info()[i].set_data_type(data_types[i]); + 
net_def.mutable_output_info()[i].set_dims(dims[i]); + } +} +{% endif %} + +void CreateOperators(std::vector &ops) { + MACE_LATENCY_LOGGER(1, "Create operators"); + + ops.resize({{ net.op|length }}); + {% for i in range(net.op|length) %} + + mace::{{tag}}::CreateOperator{{i}}(ops[{{i}}]); + {% endfor %} +} + +void CreateTensors(std::vector &tensors, + const unsigned char *model_data) { + MACE_LATENCY_LOGGER(1, "Create tensors"); + tensors.reserve({{ net.tensors|length }}); + + {% for tensor in tensors %} + + mace::{{tag}}::CreateTensor{{tensor.id}}(tensors, model_data); + {% endfor %} +} + + +{% if net.mem_arena.mem_block|length != 0 %} +void CreateMemoryArena(mace::MemoryArena &mem_arena) { + std::vector &mem_block = mem_arena.mutable_mem_block(); + mem_block.reserve({{ net.mem_arena.mem_block|length }}); + + {% for mem_blk in net.mem_arena.mem_block %} + mem_block.emplace_back(mace::MemoryBlock({{ mem_blk.mem_id }}, + {{mem_blk.x}}, + {{mem_blk.y}})); + {% endfor %} + +} +{% endif %} + +} // namespace + +namespace {{tag}} { + +NetDef CreateNet(const unsigned char *model_data) { + MACE_LATENCY_LOGGER(1, "Create net {{ net.name }}"); + NetDef net_def; + net_def.set_name("{{ net.name}}"); + net_def.set_version("{{ net.version }}"); + + {% if net.arg|length != 0 %} + CreateNetArg(net_def); + {% endif %} + + CreateOperators(net_def.mutable_op()); + + CreateTensors(net_def.mutable_tensors(), model_data); + + {% if net.mem_arena.mem_block|length != 0 %} + CreateMemoryArena(net_def.mutable_mem_arena()); + {% endif %} + + {% if net.output_info | length > 0 %} + CreateOutputInfo(net_def); + {% endif %} + + return net_def; +} + +const std::string ModelChecksum() { + return {{ model_pb_checksum|tojson }}; +} + +} // namespace {{tag}} +} // namespace mace diff --git a/mace/python/tools/model_header.template b/mace/python/tools/model_header.template new file mode 100644 index 0000000000000000000000000000000000000000..9f5c776d52bd6456bf3c410216f5b4de1ce1fa58 --- /dev/null +++ 
# --- residue: mace/python/tools/model_header.template (generated C++ header;
# its '#include' targets were stripped in this residue) and the diff header
# for mace/python/tools/opencl_codegen.py. ---

# mace/python/tools/opencl_codegen.py
import argparse
import os
import sys

import numpy as np

# python mace/python/tools/opencl_codegen.py \
#     --cl_binary_dirs=${CL_BIN_DIR} --output_path=${CL_HEADER_PATH}

FLAGS = None


def generate_cpp_source():
    """Collect compiled OpenCL .bin files and render them into a C++ map source.

    Reads every '*.bin' under FLAGS.cl_binary_dirs and renders the
    'str2vec_maps.cc.tmpl' jinja template with kernel-name -> byte list.
    """
    import jinja2  # lazy: only required when rendering

    maps = {}
    for cl_binary_dir in FLAGS.cl_binary_dirs.split(","):
        if not os.path.exists(cl_binary_dir):
            print("Input cl_binary_dir " + cl_binary_dir + " doesn't exist!")
            # fixed: original fell through into os.listdir() and crashed
            continue
        for file_name in os.listdir(cl_binary_dir):
            file_path = os.path.join(cl_binary_dir, file_name)
            if file_path[-4:] == ".bin":
                # read binary; fixed: close the handle deterministically
                with open(file_path, "rb") as f:
                    binary_array = np.fromfile(f, dtype=np.uint8)
                maps[file_name[:-4]] = [hex(ele) for ele in binary_array]

    env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
    return env.get_template('str2vec_maps.cc.tmpl').render(
        maps=maps,
        data_type='unsigned char',
        variable_name='kCompiledProgramMap'
    )


def main(unused_args):
    """Generate the C++ source embedding compiled CL binaries at FLAGS.output_path."""
    cpp_cl_binary_source = generate_cpp_source()
    if os.path.isfile(FLAGS.output_path):
        os.remove(FLAGS.output_path)
    # fixed: use a context manager instead of manual open/close
    with open(FLAGS.output_path, "w") as w_file:
        w_file.write(cpp_cl_binary_source)


def parse_args():
    """Parses command line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cl_binary_dirs",
        type=str,
        default="cl_bin0/,cl_bin1/,cl_bin2/",
        help="The cl binaries directories.")
    parser.add_argument(
        "--output_path",
        type=str,
        default="./mace/examples/codegen/opencl/opencl_compiled_program.cc",
        help="The path of generated C++ header file which contains cl binaries.")
    return parser.parse_known_args()


if __name__ == '__main__':
    FLAGS, unparsed = parse_args()
    main(unused_args=[sys.argv[0]] + unparsed)

# --- residue: diff header and head of mace/python/tools/operator.template
# (generated C++/jinja); that template continues in later spans. ---
+// + +#include +#include + +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace { + +void UpdateOp(mace::OperatorDef &op, + const std::string &name, + const std::string &type, + const std::vector &inputs, + const std::vector &outputs, + const std::vector &output_types, + uint32_t node_id, + const std::vector &mem_ids) { + op.set_name(name); + op.set_type(type); + op.set_input(inputs); + op.set_output(outputs); + op.set_output_type(output_types); + op.set_node_id(node_id); + op.set_mem_id(mem_ids); +} + +} // namespace +} // namespace mace + +namespace mace { +namespace {{tag}} { + +{% for i in range(start, end) %} + +void CreateOperator{{i}}(mace::OperatorDef &op) { + MACE_LATENCY_LOGGER(2, "Create operator {{ net.op[i].name }}"); + + mace::Argument *arg = nullptr; + {% for arg in net.op[i].arg %} + + arg = op.add_arg(); + arg->set_name({{ arg.name|tojson }}); + + {%- if arg.HasField('f') %} + arg->set_f({{ arg.f }}); + {%- endif %} + {%- if arg.HasField('i') %} + arg->set_i({{ arg.i }}); + {%- endif %} + {%- if arg.HasField('s') %} + arg->set_s({{ arg.s|tojson }}); + {%- endif %} + + {% if arg.floats|length != 0 %} + arg->set_floats({ {{ arg.floats|join(', ') }} }); + {% endif %} + {% if arg.ints|length != 0 %} + arg->set_ints({ {{ arg.ints|join(', ') }} }); + {% endif %} + {% if arg.strings|length != 0 %} + arg->set_strings({ {{ arg.strings|stringfy() }} }); + {% endif %} + {% endfor %} + + {% for shape in net.op[i].output_shape %} + {% if shape.dims | length > 0 %} + op.add_output_shape(mace::OutputShape({ {{ shape.dims|join(', ') }} })); + {% endif %} + {% endfor %} + + std::vector output_types_int({ {{ net.op[i].output_type | join(', ') }} }); + std::vector output_types({{ net.op[i].output_type | length }}); + for (int k = 0; k < {{ net.op[i].output_type | length }}; ++k) { + output_types[k] = static_cast(output_types_int[k]); + } + UpdateOp(op, {{ net.op[i].name|tojson }}, {{ 
net.op[i].type|tojson}}, + { {{ net.op[i].input|stringfy }} }, + { {{ net.op[i].output|stringfy }} }, + output_types, + {{ net.op[i].node_id }}, + { {{ net.op[i].mem_id | join(', ') }} }); + + {% if runtime == 'dsp' %} + op.set_padding({{ net.op[i].padding }}); + {% if net.op[i].node_input | length > 0 %} + std::vector input_node_ids({ {{ net.op[i].node_input | map(attribute='node_id') | join(', ') }} }); + std::vector input_output_ports({ {{ net.op[i].node_input | map(attribute='output_port') | join(', ')}} }); + + for (size_t i = 0; i < {{ net.op[i].node_input | length }}; ++i) { + mace::NodeInput input(input_node_ids[i], input_output_ports[i]); + op.add_node_input(input); + } + {% endif %} + {% if net.op[i].out_max_byte_size | length > 0 %} + std::vector out_max_byte_sizes {{ net.op[i].out_max_byte_size | replace('[', '{') | replace(']', '}') }}; + for (size_t i = 0; i < {{ net.op[i].out_max_byte_size | length }}; ++i) { + op.add_out_max_byte_size(out_max_byte_sizes[i]); + } + {% endif %} + {% endif %} +} + +{% endfor %} + +} // namespace {{tag}} +} // namespace mace + diff --git a/mace/python/tools/source_converter_lib.py b/mace/python/tools/source_converter_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..52fc6126210355f4aa177319c46fe0f476680d86 --- /dev/null +++ b/mace/python/tools/source_converter_lib.py @@ -0,0 +1,187 @@ +import os +import uuid +import numpy as np +import hashlib + +from mace.proto import mace_pb2 +from jinja2 import Environment, FileSystemLoader + + +GENERATED_NAME = set() + +def generate_obfuscated_name(namespace, name): + md5 = hashlib.md5() + md5.update(namespace) + md5.update(name) + md5_digest = md5.hexdigest() + + name = md5_digest[:8] + while name in GENERATED_NAME: + name = md5_digest + assert name not in GENERATED_NAME + GENERATED_NAME.add(name) + return name + +def generate_tensor_map(tensors): + tensor_map = {} + for t in tensors: + if not tensor_map.has_key(t.name): + tensor_map[t.name] = 
generate_obfuscated_name("tensor", t.name) + return tensor_map + +def generate_in_out_map(ops, tensor_map): + in_out_map = {} + for op in ops: + op.name = generate_obfuscated_name("op", op.name) + for input_name in op.input: + if not in_out_map.has_key(input_name): + if tensor_map.has_key(input_name): + in_out_map[input_name] = tensor_map[input_name] + else: + in_out_map[input_name] = generate_obfuscated_name("in", input_name) + for output_name in op.output: + if not in_out_map.has_key(output_name): + if tensor_map.has_key(output_name): + in_out_map[output_name] = tensor_map[output_name] + else: + in_out_map[output_name] = generate_obfuscated_name("out", output_name) + return in_out_map + +def obfuscate_name(net_def): + input_node = "mace_input_node" + output_node = "mace_output_node" + tensor_map = generate_tensor_map(net_def.tensors) + in_out_map = generate_in_out_map(net_def.op, tensor_map) + for t in net_def.tensors: + if input_node not in t.name and output_node not in t.name: + t.name = tensor_map[t.name] + for op in net_def.op: + for i in range(len(op.input)): + if input_node not in op.input[i]: + op.input[i] = in_out_map[op.input[i]] + for i in range(len(op.output)): + if output_node not in op.output[i]: + op.output[i] = in_out_map[op.output[i]] + +def rename_tensor(net_def): + tensor_map = {} + for t in net_def.tensors: + if not tensor_map.has_key(t.name): + tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_") + t.name = tensor_map[t.name] + for op in net_def.op: + for i in range(len(op.input)): + if tensor_map.has_key(op.input[i]): + op.input[i] = tensor_map[op.input[i]] + for i in range(len(op.output)): + if tensor_map.has_key(op.output[i]): + op.output[i] = tensor_map[op.output[i]] + +class TensorInfo: + def __init__(self, id, t, runtime): + self.id = id + self.data_type = mace_pb2.DataType.Name(t.data_type) + if t.data_type == mace_pb2.DT_FLOAT: + if runtime == 'gpu': + self.data_type = mace_pb2.DT_HALF + self.data = 
bytearray(np.array(t.float_data).astype(np.float16).tobytes())
      else:
        self.data_type = mace_pb2.DT_FLOAT
        self.data = bytearray(np.array(t.float_data).astype(np.float32).tobytes())
    elif t.data_type == mace_pb2.DT_INT32:
      self.data = bytearray(np.array(t.int32_data).astype(np.int32).tobytes())
    elif t.data_type == mace_pb2.DT_UINT8:
      # uint8 tensor values are carried in the proto's int32_data field.
      self.data = bytearray(np.array(t.int32_data).astype(np.uint8).tolist())


# Render an iterable of values as a comma-separated list of quoted strings,
# e.g. ['a', 'b'] -> '"a", "b"'.  Registered below as the Jinja2 filter
# 'stringfy' used by the code-generation templates.
def stringfy(value):
  return ', '.join('"{0}"'.format(w) for w in value)


# Generate the C++ sources for a converted model.
#
# Args:
#   net_def: mace_pb2.NetDef to emit (mutated in place by renaming below).
#   mode_pb_checksum: checksum string passed through to the model template.
#   template_dir: directory containing the Jinja2 *.template files.
#   obfuscate: if true, obfuscate all names; otherwise just sanitize them.
#   model_tag: C++ namespace / file tag for the generated model.
#   output: path of the main generated .cc file; siblings go next to it.
#   runtime: target runtime ('gpu' selects fp16 weight storage).
#   embed_model_data: if true, weights are embedded in the source; otherwise
#     they are written to a separate <model_tag>.data file.
def convert_to_source(net_def, mode_pb_checksum, template_dir, obfuscate, model_tag, output, runtime, embed_model_data):
  if obfuscate:
    obfuscate_name(net_def)
  else:
    rename_tensor(net_def)

  # Capture our current directory
  print template_dir

  # Create the jinja2 environment.
  j2_env = Environment(loader=FileSystemLoader(template_dir), trim_blocks=True)
  j2_env.filters['stringfy'] = stringfy
  output_dir = os.path.dirname(output) + '/'
  # generate tensor source files
  template_name = 'tensor_source.template'
  model_data = []
  offset = 0
  counter = 0
  for t in net_def.tensors:
    tensor_info = TensorInfo(counter, t, runtime)
    # align
    # Keep every non-uint8 tensor 4-byte aligned inside the packed blob; the
    # zero padding is appended to model_data and counted into offset.
    if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0:
      padding = 4 - offset % 4
      model_data.extend(bytearray([0] * padding))
      offset += padding
    # One tensorN.cc per tensor; the template records the tensor's offset
    # into the shared model data blob.
    source = j2_env.get_template(template_name).render(
      tensor_info = tensor_info,
      tensor = t,
      tag = model_tag,
      runtime = runtime,
      offset = offset,
    )
    model_data.extend(tensor_info.data)
    offset += len(tensor_info.data)
    with open(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f:
      f.write(source)
    counter += 1

  # generate tensor data
  template_name = 'tensor_data.template'
  source = j2_env.get_template(template_name).render(
    tag = model_tag,
    embed_model_data = embed_model_data,
    model_data_size = offset,
    model_data = model_data
  )
  with open(output_dir + 'tensor_data' + '.cc', "wb") as f:
    f.write(source)
  if not embed_model_data:
    # Weights are shipped beside the source as <model_tag>.data instead of
    # being embedded in the generated .cc file.
    f = 
open(output_dir + model_tag + '.data', "wb") + f.write(bytearray(model_data)) + f.close() + + # generate op source files + template_name = 'operator.template' + counter = 0 + op_size = len(net_def.op) + for start in range(0, op_size, 10): + source = j2_env.get_template(template_name).render( + start = start, + end = min(start+10, op_size), + net = net_def, + tag = model_tag, + runtime = runtime, + ) + with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f: + f.write(source) + counter += 1 + + # generate model source files + template_name = 'model.template' + tensors = [TensorInfo(i, net_def.tensors[i], runtime) for i in range(len(net_def.tensors))] + source = j2_env.get_template(template_name).render( + tensors = tensors, + net = net_def, + tag = model_tag, + runtime = runtime, + model_pb_checksum = mode_pb_checksum + ) + with open(output, "wb") as f: + f.write(source) + + # generate model header file + template_name = 'model_header.template' + source = j2_env.get_template(template_name).render( + tag = model_tag, + ) + with open(output_dir + model_tag + '.h', "wb") as f: + f.write(source) diff --git a/mace/python/tools/tensor_data.template b/mace/python/tools/tensor_data.template new file mode 100644 index 0000000000000000000000000000000000000000..d009b101164e32cff237058239db7d902aca12e5 --- /dev/null +++ b/mace/python/tools/tensor_data.template @@ -0,0 +1,64 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// Generated by the mace converter. DO NOT EDIT! 
+// + +#include +#include + +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +{% if not embed_model_data %} + +#include +#include +#include +#include +#include + +{% endif %} + +namespace mace { +namespace {{tag}} { + +{% if embed_model_data %} +alignas(4) const unsigned char model_data[{{ model_data_size }}] = { +{% for d in model_data %}{{"0x%02X, " % d }}{%endfor%} +}; +{% endif %} + +const unsigned char *LoadModelData(const char *model_data_file) { +{% if embed_model_data %} + return model_data; +{% else %} + int fd = open(model_data_file, O_RDONLY); + MACE_CHECK(fd >= 0, "Failed to open model data file ", + model_data_file, ", error code: ", errno); + + const unsigned char *model_data = + static_cast(mmap(nullptr, {{ model_data_size }}, + PROT_READ, MAP_PRIVATE, fd, 0)); + MACE_CHECK(model_data != MAP_FAILED, "Failed to map model data file ", + model_data_file, ", error code: ", errno); + + int ret = close(fd); + MACE_CHECK(ret == 0, "Failed to close model data file ", + model_data_file, ", error code: ", errno); + + return model_data; +{% endif %} +} + +void UnloadModelData(const unsigned char *model_data) { +{% if not embed_model_data %} + int ret = munmap(const_cast(model_data), + {{ model_data_size }}); + MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ", errno); +{% endif %} +} + +} // namespace {{tag}} +} // namespace mace + diff --git a/mace/python/tools/tensor_source.template b/mace/python/tools/tensor_source.template new file mode 100644 index 0000000000000000000000000000000000000000..c321112ed08fa74356f0b64abeb6887ea3025542 --- /dev/null +++ b/mace/python/tools/tensor_source.template @@ -0,0 +1,26 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// Generated by the mace converter. DO NOT EDIT! 
+// + +#include +#include + +#include "mace/public/mace.h" +#include "mace/utils/env_time.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace {{tag}} { + +void CreateTensor{{tensor_info.id}}(std::vector &tensors, + const unsigned char *model_data) { + MACE_LATENCY_LOGGER(2, "Create tensor {{ tensor.name }}"); + tensors.emplace_back(mace::ConstTensor( + {{ tensor.name|tojson }}, model_data + {{ offset }}, + { {{ tensor.dims|join(', ') }} }, {{ tensor_info.data_type }}, {{ tensor.node_id }})); +} + +} // namespace {{tag}} +} // namespace mace + diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..5b488b1e526ee2689d26b9678d9ac72b62564d8c --- /dev/null +++ b/mace/python/tools/tf_converter_lib.py @@ -0,0 +1,1047 @@ +from mace.proto import mace_pb2 +import tensorflow as tf +import numpy as np +import math +import copy +from tensorflow import gfile +from mace.python.tools import memory_optimizer +from tensorflow.core.framework import graph_pb2 +from tensorflow.core.framework import tensor_shape_pb2 + +# TODO: support NCHW formt, now only support NHWC. 
+padding_mode = { + 'VALID': 0, + 'SAME': 1, + 'FULL': 2 +} +pooling_type_mode = { + 'AvgPool': 1, + 'MaxPool': 2 +} + +buffer_type_map = { + 'CONV2D_FILTER' : 0, + 'IN_OUT_CHANNEL' : 1, + 'ARGUMENT' : 2, + 'IN_OUT_HEIGHT' : 3, + 'IN_OUT_WIDTH' : 4, + 'WINOGRAD_FILTER' : 5, + 'DW_CONV2D_FILTER' : 6, +} + +data_type_map = { + 'DT_HALF' : mace_pb2.DT_HALF, + 'DT_FLOAT': mace_pb2.DT_FLOAT +} + +activation_name_map = { + 'Relu' : 'RELU', + 'Sigmoid' : 'SIGMOID', + 'Tanh' : 'TANH', + 'Relu6' : 'RELUX' +} + +BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"] + +MACE_INPUT_NODE_NAME = "mace_input_node" +MACE_OUTPUT_NODE_NAME = "mace_output_node" + +OPENCL_IMAGE_MAX_SIZE = 16384 + +def get_input_tensor(op, index): + input_tensor = op.inputs[index] + if input_tensor.op.type == 'Reshape': + input_tensor = get_input_tensor(input_tensor.op, 0) + return input_tensor + +class TFConverter(object): + def __init__(self, tf_ops, net_def, dt, device, winograd): + self.net_def = net_def + self.tf_ops = tf_ops + self.dt = dt + self.device = device + self.winograd = winograd + self.tf_graph = {} + self.tf_parents = {} + self.resolved_ops = {} + self.unused_tensor = set() + self.transpose_filter_tensor = {} + self.reshape_tensor = {} + self.ops = {} + + for op in tf_ops: + self.ops[op.name] = op + + for op in tf_ops: + self.resolved_ops[op.name] = 0 + for input in op.inputs: + input_name = input.name[:-2] + if input_name not in self.tf_graph: + self.tf_graph[input_name] = [] + self.tf_graph[input_name].append(op) + if op.name not in self.tf_parents: + self.tf_parents[op.name] = [] + self.tf_parents[op.name].append(self.ops[input_name]) + + def add_buffer_to_image(self, input_name, input_type): + output_name = input_name[:-2] + "_b2i" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'BufferToImage' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 
'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'mode' + arg.i = 0 + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_image_to_buffer(self, input_name, input_type): + output_name = input_name[:-2] + "_i2b" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_input_transform(self, names, is_single): + for name in names: + if is_single: + new_input_name = MACE_INPUT_NODE_NAME + ":0" + else: + new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'BufferToImage' + op_def.input.extend([new_input_name]) + op_def.output.extend([name+':0']) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + def add_output_transform(self, names, is_single): + for name in names: + if is_single: + output_name = MACE_OUTPUT_NODE_NAME + ":0" + else: + output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([name+':0']) + op_def.output.extend([output_name]) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + @staticmethod + def add_output_shape(outputs, op): + output_shapes = [] + for output in outputs: + if output.shape.num_elements() is not None: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + 
op.output_shape.extend(output_shapes) + + def add_tensor(self, name, shape, tf_dt, value): + tensor = self.net_def.tensors.add() + tensor.name = name + + shape = list(shape) + tensor.dims.extend(shape) + + if tf_dt == tf.float32: + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(value.flat) + elif tf_dt == tf.int32: + tensor.data_type = mace_pb2.DT_INT32 + tensor.int32_data.extend(value.flat) + else: + raise Exception("Not supported tensor type: " + tf_dt.name) + + def convert_reshape(self, op): + input_tensor = get_input_tensor(op, 0) + shape_tensor = get_input_tensor(op, 1) + shape_value = shape_tensor.eval().astype(np.int32) + self.unused_tensor.add(shape_tensor.name) + self.reshape_tensor[input_tensor.name] = shape_value + self.resolved_ops[op.name] = 1 + + def convert_tensor(self, op): + output_name = op.outputs[0].name + if output_name not in self.unused_tensor: + tensor = self.net_def.tensors.add() + tf_tensor = op.outputs[0].eval() + if output_name in self.transpose_filter_tensor: + tf_tensor = tf_tensor.transpose(self.transpose_filter_tensor[output_name]) + if output_name in self.reshape_tensor: + tf_tensor = tf_tensor.reshape(self.reshape_tensor[output_name]) + tensor.name = op.outputs[0].name + + shape = list(tf_tensor.shape) + tensor.dims.extend(shape) + + tf_dt = op.get_attr('dtype') + if tf_dt == tf.float32: + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(tf_tensor.astype(np.float32).flat) + elif tf_dt == tf.int32: + tensor.data_type = mace_pb2.DT_INT32 + tensor.int32_data.extend(tf_tensor.astype(np.int32).flat) + else: + raise Exception("Not supported tensor type: " + tf_dt.name) + self.resolved_ops[op.name] = 1 + + def check_winograd_conv(self, op): + filter_shape = get_input_tensor(op, 1).shape.as_list() + strides = op.get_attr('strides')[1:3] + output_shape = op.outputs[0].shape.as_list() + if len(output_shape) == 0 or output_shape[0] is None: + return False + width = output_shape[0] * ((output_shape[1] + 1)/2) * 
((output_shape[2]+1)/2) + return self.winograd and op.type != 'DepthwiseConv2dNative' and self.device == 'gpu' and \ + filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \ + (strides[0] == 1) and (strides[0] == strides[1]) and \ + (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ + (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ + (width < OPENCL_IMAGE_MAX_SIZE) + + def convert_winograd_conv(self, op): + filter_tensor = get_input_tensor(op, 1) + filter_shape = filter_tensor.shape.as_list() + output_shape = op.outputs[0].shape.as_list() + + self.transpose_filter_tensor[filter_tensor.name] = (3, 2, 0, 1) + filter_name = self.add_buffer_to_image(op.inputs[1].name, "WINOGRAD_FILTER") + + # Input transform + wt_op = mace_pb2.OperatorDef() + arg = wt_op.arg.add() + arg.name = 'T' + arg.i = self.dt + padding_arg = wt_op.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode[op.get_attr('padding')] + wt_op.name = op.name + '_input_transform' + wt_op.type = 'WinogradTransform' + wt_op.input.extend([op.inputs[0].name]) + wt_output_name = wt_op.name + ":0" + wt_op.output.extend([wt_output_name]) + wt_output_shape = mace_pb2.OutputShape() + wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) + wt_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1]) + wt_op.output_shape.extend([wt_output_shape]) + + # MatMul + matmul_op = mace_pb2.OperatorDef() + arg = matmul_op.arg.add() + arg.name = 'T' + arg.i = self.dt + matmul_op.name = op.name + '_matmul' + matmul_op.type = 'MatMul' + matmul_op.input.extend([filter_name, wt_output_name]) + matmul_output_name = matmul_op.name + ":0" + matmul_op.output.extend([matmul_output_name]) + matmul_output_shape = mace_pb2.OutputShape() + matmul_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1]) + matmul_op.output_shape.extend([matmul_output_shape]) + + # Inverse transform + iwt_op = mace_pb2.OperatorDef() + arg = iwt_op.arg.add() + arg.name = 
'T' + arg.i = self.dt + batch_arg = iwt_op.arg.add() + batch_arg.name = 'batch' + batch_arg.i = output_shape[0] + height_arg = iwt_op.arg.add() + height_arg.name = 'height' + height_arg.i = output_shape[1] + width_arg = iwt_op.arg.add() + width_arg.name = 'width' + width_arg.i = output_shape[2] + iwt_op.name = op.name + '_inverse_transform' + iwt_op.type = 'WinogradInverseTransform' + iwt_op.input.extend([matmul_output_name]) + + final_op = op + self.resolved_ops[op.name] = 1 + + if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd' : + bias_add_op = self.tf_graph[op.name][0] + output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT") + iwt_op.input.extend([output_name]) + final_op = bias_add_op + self.resolved_ops[bias_add_op.name] = 1 + + if len(self.tf_graph[final_op.name]) == 1 \ + and self.tf_graph[final_op.name][0].type in activation_name_map: + activation_op = self.tf_graph[final_op.name][0] + fused_act_arg = iwt_op.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + if activation_op.type == 'Relu6': + max_limit_arg = iwt_op.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + final_op = activation_op + self.resolved_ops[activation_op.name] = 1 + + iwt_op.output.extend([output.name for output in final_op.outputs]) + self.add_output_shape(final_op.outputs, iwt_op) + self.net_def.op.extend([wt_op, matmul_op, iwt_op]) + + + def convert_conv2d(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + if op.type == 'DepthwiseConv2dNative': + op_def.type = 'DepthwiseConv2d' + else: + op_def.type = op.type + self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (0, 1, 3, 2) + if self.device == 'gpu': + op_def.input.extend([op.inputs[0].name]) + buffer_type = "DW_CONV2D_FILTER" if op_def.type == 'DepthwiseConv2d' else "CONV2D_FILTER" + output_name = 
self.add_buffer_to_image(get_input_tensor(op, 1).name, buffer_type) + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(op, i).name for i in range(len(op.inputs))]) + + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode[op.get_attr('padding')] + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend(op.get_attr('strides')[1:3]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + final_op = op + self.resolved_ops[op.name] = 1 + + if len(self.tf_graph.get(op.name, [])) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd': + bias_add_op = self.tf_graph[op.name][0] + if self.device == 'gpu': + output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) + final_op = bias_add_op + self.resolved_ops[bias_add_op.name] = 1 + + if len(self.tf_graph.get(final_op.name, [])) == 1 \ + and self.tf_graph[final_op.name][0].type in activation_name_map: + activation_op = self.tf_graph[final_op.name][0] + op_def.type = "FusedConv2D" + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + if activation_op.type == 'Relu6': + max_limit_arg = op_def.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + final_op = activation_op + self.resolved_ops[activation_op.name] = 1 + + op_def.output.extend([output.name for output in final_op.outputs]) + self.add_output_shape(final_op.outputs, op_def) + self.net_def.op.extend([op_def]) + + def convert_fused_batchnorm(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + op_def.name = op.name + op_def.type = 
'FoldedBatchNorm' + + gamma_tensor = get_input_tensor(op, 1) + for i in range(1, 5): + input_tensor = get_input_tensor(op, i) + assert input_tensor.shape == gamma_tensor.shape + self.unused_tensor.add(input_tensor.name) + + gamma_value = get_input_tensor(op, 1).eval().astype(np.float32) + beta_value = get_input_tensor(op, 2).eval().astype(np.float32) + mean_value = get_input_tensor(op, 3).eval().astype(np.float32) + var_value = get_input_tensor(op, 4).eval().astype(np.float32) + epsilon_value = op.get_attr('epsilon') + + scale_value = ( + (1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) * + gamma_value) + offset_value = (-mean_value * scale_value) + beta_value + idx = gamma_tensor.name.rfind('/') + name_prefix = gamma_tensor.name[:idx] + '/' + input_names = [name_prefix+'scale:0', name_prefix+'offset:0'] + self.add_tensor(input_names[0], gamma_value.shape, + gamma_tensor.dtype, scale_value) + self.add_tensor(input_names[1], gamma_value.shape, + gamma_tensor.dtype, offset_value) + + op_def.input.extend([op.inputs[0].name]) + if self.device == 'gpu': + for name in input_names: + output_name = self.add_buffer_to_image(name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([name for name in input_names]) + + self.resolved_ops[op.name] = 1 + final_op = op + + if len(self.tf_graph[op.name]) == 1 \ + and self.tf_graph[op.name][0].type in activation_name_map: + activation_op = self.tf_graph[op.name][0] + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + if activation_op.type == 'Relu6': + max_limit_arg = op_def.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + final_op = activation_op + self.resolved_ops[activation_op.name] = 1 + + op_def.output.extend([final_op.outputs[0].name]) + self.add_output_shape([final_op.outputs[0]], op_def) + + self.net_def.op.extend([op_def]) + + def convert_batchnorm(self, op): + bn_ops = [] + 
bn_ops.append(op) + for i in range(1, 3): + if len(self.tf_graph[bn_ops[i-1].name]) == 1 \ + and self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]: + bn_ops.append(self.tf_graph[bn_ops[i-1].name][0]) + else: + raise Exception('Invalid BatchNorm Op') + if len(self.tf_graph[bn_ops[2].name]) == 2 \ + and self.tf_graph[bn_ops[2].name][0].type == BATCH_NORM_ORDER[3] \ + and self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]: + bn_ops.append(self.tf_graph[bn_ops[2].name][0]) + bn_ops.append(self.tf_graph[bn_ops[2].name][1]) + else: + raise Exception('Invalid BatchNorm Op') + bn_ops.append(self.tf_graph[bn_ops[4].name][0]) + bn_ops.append(self.tf_graph[bn_ops[3].name][0]) + + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + input_name = get_input_tensor(bn_ops[3], 0).name + gamma = get_input_tensor(bn_ops[2], 1).name + beta = get_input_tensor(bn_ops[5], 0).name + mean = get_input_tensor(bn_ops[4], 0).name + variance = get_input_tensor(bn_ops[0], 0).name + + op_def.name = op.name[:-4] # remove /add + op_def.type = 'BatchNorm' + if self.device == 'gpu': + op_def.input.extend([input_name]) + for tensor_name in [gamma, beta, mean, variance]: + output_name = self.add_buffer_to_image(tensor_name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([input_name, gamma, beta, mean, variance]) + op_def.output.extend([output.name for output in bn_ops[6].outputs]) + self.add_output_shape(bn_ops[6].outputs, op_def) + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'epsilon' + epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + self.unused_tensor.add(get_input_tensor(op, 1).name) + + self.net_def.op.extend([op_def]) + for i in range(0, 7): + self.resolved_ops[bn_ops[i].name] = 1 + + def convert_pooling(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() 
    arg.name = 'T'
    arg.i = self.dt
    op_def.name = op.name
    op_def.type = 'Pooling'
    op_def.input.extend([input.name for input in op.inputs])
    op_def.output.extend([output.name for output in op.outputs])
    self.add_output_shape(op.outputs, op_def)
    pooling_type_arg = op_def.arg.add()
    pooling_type_arg.name = 'pooling_type'
    pooling_type_arg.i = pooling_type_mode[op.type]
    padding_arg = op_def.arg.add()
    padding_arg.name = 'padding'
    padding_arg.i = padding_mode[op.get_attr('padding')]
    strides_arg = op_def.arg.add()
    strides_arg.name = 'strides'
    # [1:3] keeps only the H/W strides of the NHWC attribute.
    strides_arg.ints.extend(op.get_attr('strides')[1:3])
    kernels_arg = op_def.arg.add()
    kernels_arg.name = 'kernels'
    kernels_arg.ints.extend(op.get_attr('ksize')[1:3])
    data_format_arg = op_def.arg.add()
    data_format_arg.name = 'data_format'
    data_format_arg.s = 'NHWC'
    self.resolved_ops[op.name] = 1

  # Lower a TF Mean over spatial axes into a MACE 'Pooling' op configured as a
  # global average pool: VALID padding, stride 1, and a kernel sized to the
  # input's full H x W (inputs[0].shape[1:3], NHWC).
  def convert_global_avg_pooling(self, op):
    op_def = self.net_def.op.add()
    arg = op_def.arg.add()
    arg.name = 'T'
    arg.i = self.dt  # data type (mace_pb2 DataType) chosen at converter init
    op_def.name = op.name
    op_def.type = 'Pooling'
    # Only the data input is kept; the reduction-axes tensor is handled by the
    # caller (added to unused_tensor in convert()).
    op_def.input.extend([op.inputs[0].name])
    op_def.output.extend([output.name for output in op.outputs])
    self.add_output_shape(op.outputs, op_def)
    pooling_type_arg = op_def.arg.add()
    pooling_type_arg.name = 'pooling_type'
    pooling_type_arg.i = pooling_type_mode['AvgPool']
    padding_arg = op_def.arg.add()
    padding_arg.name = 'padding'
    padding_arg.i = padding_mode['VALID']
    strides_arg = op_def.arg.add()
    strides_arg.name = 'strides'
    strides_arg.ints.extend([1, 1])
    kernels_arg = op_def.arg.add()
    kernels_arg.name = 'kernels'
    # Kernel covers the whole spatial extent -> global pooling.
    kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3])
    data_format_arg = op_def.arg.add()
    data_format_arg.name = 'data_format'
    data_format_arg.s = 'NHWC'
    self.resolved_ops[op.name] = 1

  # Lower a standalone activation op (Relu/Sigmoid/Tanh) to a MACE
  # 'Activation' op; the concrete kind is carried in the 'activation' arg.
  def convert_activation(self, op):
    op_def = self.net_def.op.add()
    arg = op_def.arg.add()
    arg.name = 'T'
    arg.i = self.dt
    op_def.name = op.name
    op_def.type = 'Activation'
    
activation_arg = op_def.arg.add() + activation_arg.name = 'activation' + activation_arg.s = activation_name_map[op.type] + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_relu6(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = 'Activation' + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + activation_arg = op_def.arg.add() + activation_arg.name = 'activation' + activation_arg.s = "RELUX" + max_limit_arg = op_def.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + self.resolved_ops[op.name] = 1 + + def convert_add(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "AddN" + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_concat(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "Concat" + op_def.input.extend([input.name for input in op.inputs[:-1]]) + op_def.output.extend([output.name for output in op.outputs]) + axis_arg = op_def.arg.add() + axis_arg.name = 'axis' + axis_arg.i = get_input_tensor(op, len(op.inputs) - 1).eval().astype(np.int32) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + self.unused_tensor.add(get_input_tensor(op, len(op.inputs) - 1).name) + + def convert_resize_bilinear(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name 
= op.name + op_def.type = "ResizeBilinear" + op_def.input.extend([op.inputs[0].name]) + op_def.output.extend([output.name for output in op.outputs]) + size_arg = op_def.arg.add() + size_arg.name = 'size' + size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat) + size_arg = op_def.arg.add() + size_arg.name = 'align_corners' + size_arg.i = op.get_attr('align_corners') + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + self.unused_tensor.add(get_input_tensor(op, 1).name) + + def convert_bias_add(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "BiasAdd" + op_def.input.extend([op.inputs[0].name]) + if self.device == 'gpu': + output_name = self.add_buffer_to_image(get_input_tensor(op, 1).name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(op, 1).name]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.net_def.op.extend([op_def]) + self.resolved_ops[op.name] = 1 + + def convert_space_to_batch(self, op, b2s): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = op.type + op_def.input.extend([op.inputs[0].name]) + op_def.output.extend([output.name for output in op.outputs]) + size_arg = op_def.arg.add() + size_arg.name = 'block_shape' + size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat) + size_arg = op_def.arg.add() + if b2s: + size_arg.name = 'crops' + else: + size_arg.name = 'paddings' + size_arg.ints.extend(get_input_tensor(op, 2).eval().astype(np.int32).flat) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + self.unused_tensor.add(get_input_tensor(op, 1).name) + self.unused_tensor.add(get_input_tensor(op, 2).name) + + def is_atrous_conv2d(self, op): + return op.type == 
'SpaceToBatchND' and\ + len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Conv2D' + + def convert_atrous_conv2d(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + conv_op = self.tf_graph[op.name][0] + op_def.name = conv_op.name + op_def.type = conv_op.type + self.transpose_filter_tensor[get_input_tensor(conv_op, 1).name] = (0, 1, 3, 2) + if self.device == 'gpu': + op_def.input.extend([op.inputs[0].name]) + output_name = self.add_buffer_to_image(get_input_tensor(conv_op, 1).name, "CONV2D_FILTER") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(op, 0).name]) + op_def.input.extend([get_input_tensor(conv_op, 1).name]) + + dilation_arg = op_def.arg.add() + dilation_arg.name = 'dilations' + dilation_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat) + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat + if len(padding_values) > 0 and padding_values[0] > 0: + padding_arg.i = padding_mode['SAME'] + else: + padding_arg.i = padding_mode['VALID'] + self.unused_tensor.add(get_input_tensor(op, 1).name) + self.unused_tensor.add(get_input_tensor(op, 2).name) + + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend([1, 1]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + final_op = conv_op + self.resolved_ops[op.name] = 1 + self.resolved_ops[conv_op.name] = 1 + + if len(self.tf_graph[final_op.name]) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd' : + bias_add_op = self.tf_graph[final_op.name][0] + if self.device == 'gpu': + output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) + final_op = bias_add_op + 
self.resolved_ops[bias_add_op.name] = 1 + + if len(self.tf_graph[final_op.name]) == 1 \ + and self.tf_graph[final_op.name][0].type == 'BatchToSpaceND': + final_op = self.tf_graph[final_op.name][0] + self.resolved_ops[final_op.name] = 1 + self.unused_tensor.add(get_input_tensor(final_op, 1).name) + self.unused_tensor.add(get_input_tensor(final_op, 2).name) + else: + raise Exception('Convert atrous conv error: no BatchToSpaceND op') + + if len(self.tf_graph[final_op.name]) == 1 \ + and self.tf_graph[final_op.name][0].type == 'Relu': + relu_op = self.tf_graph[final_op.name][0] + op_def.type = "FusedConv2D" + fused_relu_arg = op_def.arg.add() + fused_relu_arg.name = 'activation' + fused_relu_arg.s = "RELU" + final_op = relu_op + self.resolved_ops[relu_op.name] = 1 + + op_def.output.extend([output.name for output in final_op.outputs]) + self.add_output_shape(final_op.outputs, op_def) + self.net_def.op.extend([op_def]) + + def is_softmax(self, op): + return op.type == 'Softmax' and \ + len(self.tf_parents[op.name]) == 1 and self.tf_parents[op.name][0].type == 'Reshape' and \ + len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Reshape' + + def convert_softmax(self, softmax_op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + # deal with first Reshape op + parent_reshape_op = self.tf_parents[softmax_op.name][0] + self.unused_tensor.add(get_input_tensor(parent_reshape_op, 1).name) + self.resolved_ops[parent_reshape_op.name] = 1 + + # FIXME: hardcode for inception_v3 + # remove squeeze if exist + squeeze_op = self.tf_parents[parent_reshape_op.name][0] + if squeeze_op.type == 'Squeeze': + op_def.input.extend([squeeze_op.inputs[0].name]) + self.resolved_ops[squeeze_op.name] = 1 + # remove shape if exist + children_ops = self.tf_graph[squeeze_op.name] + print children_ops + if len(children_ops) > 1 and children_ops[0].type == 'Shape': + self.unused_tensor.add(get_input_tensor(children_ops[1], 0).name) + 
self.resolved_ops[children_ops[1].name] = 1 + else: + op_def.input.extend([parent_reshape_op.inputs[0].name]) + + # deal with Softmax op + op_def.name = softmax_op.name + op_def.type = softmax_op.type + self.resolved_ops[softmax_op.name] = 1 + + # deal with last Reshape op + reshape_op = self.tf_graph[softmax_op.name][0] + self.unused_tensor.add(get_input_tensor(reshape_op, 1).name) + + if reshape_op.outputs[0].shape.ndims == 2: + shape = reshape_op.outputs[0].shape + from tensorflow.python.framework.tensor_shape import as_shape + reshape_op.outputs[0]._shape = as_shape([1, 1, shape[0], shape[1]]) + op_def.output.extend([output.name for output in reshape_op.outputs]) + self.add_output_shape(reshape_op.outputs, op_def) + self.resolved_ops[reshape_op.name] = 1 + + def convert_normal_op(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = op.type + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def replace_in_out_name(self, input_names, output_names, is_single): + in_names = set([input_name + ":0" for input_name in input_names]) + out_names = set([output_name + ":0" for output_name in output_names]) + if is_single: + for op in self.net_def.op: + if len(op.input) > 0 and op.input[0] in in_names: + op.input[0] = MACE_INPUT_NODE_NAME + ':0' + if len(op.output) > 0 and op.output[0] in out_names: + op.output[0] = MACE_OUTPUT_NODE_NAME + ':0' + else: + for op in self.net_def.op: + if len(op.input) > 0 and op.input[0] in in_names: + op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0] + if len(op.output) > 0 and op.output[0] in out_names: + op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0] + + def convert(self, input_nodes, output_nodes): + is_single = len(input_nodes) == 1 and len(output_nodes) == 1 + if self.device == 
'gpu': + self.add_input_transform(input_nodes, is_single) + + for op in self.tf_ops: + if self.resolved_ops[op.name] == 1: + continue + if op.type in ['Placeholder', 'Identity']: + self.resolved_ops[op.name] = 1 + pass + elif op.type == 'Const': + pass + elif op.type == 'Reshape': + self.convert_reshape(op) + elif self.is_atrous_conv2d(op): + self.convert_atrous_conv2d(op) + elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative': + if self.check_winograd_conv(op): + self.convert_winograd_conv(op) + else: + self.convert_conv2d(op) + elif op.type == 'FusedBatchNorm': + self.convert_fused_batchnorm(op) + elif op.type == 'Add' and op.name.endswith('batchnorm/add'): + self.convert_batchnorm(op) + elif op.type == 'AvgPool' or op.type == 'MaxPool': + self.convert_pooling(op) + elif op.type == 'Relu6': + self.convert_relu6(op) + elif op.type == 'Add': + self.convert_add(op) + elif op.type == 'ConcatV2': + self.convert_concat(op) + elif op.type == 'ResizeBilinear': + self.convert_resize_bilinear(op) + elif op.type == 'BiasAdd': + self.convert_bias_add(op) + elif op.type == 'SpaceToBatchND': + self.convert_space_to_batch(op, False) + elif op.type == 'BatchToSpaceND': + self.convert_space_to_batch(op, True) + elif self.is_softmax(op): + self.convert_softmax(op) + elif op.type in ['Relu', 'Sigmoid', 'Tanh']: + self.convert_activation(op) + # FIXME: hardcode for inception_v3 + elif op.type in ['Squeeze', 'Shape']: + self.resolved_ops[op.name] = 1 + elif op.type == 'Mean': + # Global avg pooling + reduce_dims = op.inputs[1].eval() + if reduce_dims[0] == 1 and reduce_dims[1] == 2: + self.convert_global_avg_pooling(op) + self.unused_tensor.add(op.inputs[1].name) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) + #elif op.type in ['']: + # self.convert_normal_op(op) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) + + for op in self.tf_ops: + if self.resolved_ops[op.name] == 1: + continue + elif op.type == 'Const': + 
self.convert_tensor(op) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) + + if self.device == 'gpu': + self.add_output_transform(output_nodes, is_single) + + if self.device == 'cpu': + self.replace_in_out_name(input_nodes, output_nodes, is_single) + + for key in self.resolved_ops: + if self.resolved_ops[key] != 1: + print 'Unresolve Op: %s' % key + +class Optimizer: + def __init__(self, net_def, device): + self.net_def = net_def + self.device = device + self.mace_graph = {} + self.tensor_map = {} + for op in net_def.op: + for input_name in op.input: + if input_name not in self.mace_graph: + self.mace_graph[input_name] = [] + self.mace_graph[input_name].append(op) + + for tensor in net_def.tensors: + self.tensor_map[tensor.name] = tensor + + def get_buffer_tensor_name(self, name): + if self.device == 'gpu': + return name[:-6] + name[-2:] + else: + return name + + def fold_batch_norm(self): + unused_tensors = set() + new_tensors = [] + new_net = mace_pb2.NetDef() + resolved_ops = set() + + for op in self.net_def.op: + if op.name in resolved_ops: + pass + elif op.type == 'DepthwiseConv2d' and len(op.output) == 1 \ + and self.mace_graph[op.output[0]][0].type == 'FoldedBatchNorm': + depthwise_conv2d_op = op + folded_bn_op = self.mace_graph[op.output[0]][0] + weight_buffer_name = self.get_buffer_tensor_name(depthwise_conv2d_op.input[1]) + weight_tensor = self.tensor_map[weight_buffer_name] + scale_buffer_name = self.get_buffer_tensor_name(folded_bn_op.input[1]) + offset_buffer_name = self.get_buffer_tensor_name(folded_bn_op.input[2]) + scale_tensor = self.tensor_map[scale_buffer_name] + weight_shape = weight_tensor.dims + idx = 0 + for i in range(weight_shape[0]): + for j in range(weight_shape[1]): + for ic in range(weight_shape[2]): + for oc in range(weight_shape[3]): + weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[3] + oc] + idx += 1 + + new_tensors.append(weight_tensor) + unused_tensors.add(weight_tensor.name) + 
unused_tensors.add(scale_tensor.name) + + if self.device == 'gpu': + scale_b2i_op = self.mace_graph[scale_buffer_name][0] + offset_b2i_op = self.mace_graph[offset_buffer_name][0] + resolved_ops.add(scale_b2i_op.name) + resolved_ops.add(offset_b2i_op.name) + new_net.op.extend([offset_b2i_op]) + + resolved_ops.add(depthwise_conv2d_op.name) + resolved_ops.add(folded_bn_op.name) + + offset_tensor_name = folded_bn_op.input[2] + depthwise_conv2d_op.input.extend([offset_tensor_name]) + + for arg in folded_bn_op.arg: + if arg.name == 'activation': + act_arg = depthwise_conv2d_op.arg.add() + act_arg.name = arg.name + act_arg.s = arg.s + elif arg.name == 'max_limit': + act_arg = depthwise_conv2d_op.arg.add() + act_arg.name = arg.name + act_arg.f = arg.f + + depthwise_conv2d_op.output[0] = folded_bn_op.output[0] + new_net.op.extend([depthwise_conv2d_op]) + else: + new_net.op.extend([op]) + + for tensor in self.net_def.tensors: + if tensor.name in unused_tensors: + pass + else: + new_net.tensors.extend([tensor]) + + for tensor in new_tensors: + new_net.tensors.extend([tensor]) + + return new_net + + def optimize(self): + new_net = self.fold_batch_norm() + return new_net + +def add_shape_info(input_graph_def, input_nodes, input_shapes): + inputs_replaced_graph = graph_pb2.GraphDef() + for node in input_graph_def.node: + if node.name in input_nodes: + idx = input_nodes.index(node.name) + input_shape = input_shapes[idx] + placeholder_node = copy.deepcopy(node) + placeholder_node.attr.clear() + placeholder_node.attr['shape'].shape.dim.extend([ + tensor_shape_pb2.TensorShapeProto.Dim(size=i) for i in input_shape + ]) + placeholder_node.attr['dtype'].CopyFrom(node.attr['dtype']) + inputs_replaced_graph.node.extend([placeholder_node]) + else: + inputs_replaced_graph.node.extend([copy.deepcopy(node)]) + return inputs_replaced_graph + + +def convert_to_mace_pb(model_file, input_node, input_shape, output_node, data_type, device, winograd): + net_def = mace_pb2.NetDef() + dt = 
data_type_map[data_type] + + input_graph_def = tf.GraphDef() + with gfile.Open(model_file, "rb") as f: + data = f.read() + input_graph_def.ParseFromString(data) + + input_nodes = [x for x in input_node.split(',')] + input_shapes = [] + if input_shape != "": + input_shape_strs = [x for x in input_shape.split(':')] + for shape_str in input_shape_strs: + input_shapes.extend([[int(x) for x in shape_str.split(',')]]) + output_nodes = [x for x in output_node.split(',')] + assert len(input_nodes) == len(input_shapes) + + input_graph_def = add_shape_info(input_graph_def, input_nodes, input_shapes) + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(input_graph_def, name="") + ops = graph.get_operations() + converter = TFConverter(ops, net_def, dt, device, winograd) + converter.convert(input_nodes, output_nodes) + optimizer = Optimizer(net_def, device) + net_def = optimizer.optimize() + print "Model Converted." + if device == 'gpu': + print "start optimize memory." + mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) + mem_optimizer.optimize() + print "Memory optimization done." 
+ + return net_def diff --git a/mace/python/tools/tf_dsp_converter_lib.py b/mace/python/tools/tf_dsp_converter_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..f53c25aa29753593ef21d670a5325a72403347da --- /dev/null +++ b/mace/python/tools/tf_dsp_converter_lib.py @@ -0,0 +1,408 @@ +from mace.proto import mace_pb2 +import tensorflow as tf +from tensorflow import gfile +from operator import mul +from dsp_ops import DspOps +from mace.python.tools import graph_util +from mace.python.tools.convert_util import tf_dtype_2_mace_dtype + +# converter --input ../libcv/quantized_model.pb --output quantized_model_dsp.pb \ +# --runtime dsp --input_node input_node --output_node output_node + +padding_mode = { + 'NA': 0, + 'SAME': 1, + 'VALID': 2, + 'MIRROR_REFLECT': 3, + 'MIRROR_SYMMETRIC': 4, + 'SAME_CAFFE': 5 +} + +def get_tensor_name_from_op(op_name, port): + return op_name + ':' + str(port) + +def get_node_from_map(op_map, op_or_tensor_name): + op_name = op_or_tensor_name.split(':')[0] + return op_map[op_name] + +def get_op_and_port_from_tensor(tensor_name): + op, port = tensor_name.split(':') + port = int(port) + return op, port + +def max_elem_size(tensor): + if len(tensor.shape.as_list()) == 0: + return tensor.dtype.size + else: + return reduce(mul, tensor.shape.as_list()) * tensor.dtype.size + +def find_dtype(tensor_dtype): + if tensor_dtype == tf.float32: + return mace_pb2.DT_FLOAT + elif tensor_dtype == tf.uint8 or tensor_dtype == tf.quint8: + return mace_pb2.DT_UINT8 + elif tensor_dtype == tf.int32 or tensor_dtype == tf.qint32: + return mace_pb2.DT_INT32 + else: + raise Exception('Unsupported data type: ', tensor_dtype) + +def has_padding_and_strides(op): + return 'padding' in op.node_def.attr and 'strides' in op.node_def.attr + +def is_node_flatten_reshape(op): + return op.type == 'Reshape' and len(op.outputs[0].shape) == 1 + +def get_input_tensor(op, index): + input_tensor = op.inputs[index] + if input_tensor.op.type == 'Reshape': + 
input_tensor = get_input_tensor(input_tensor.op, 0) + return input_tensor + +def add_shape_const_node(net_def, op, values, name): + print ('Add const node: ', op.name + '/' + name) + tensor = net_def.tensors.add() + node_name = op.name + '/' + name + tensor.name = node_name + ':0' + tensor.data_type = mace_pb2.DT_INT32 + tensor.dims.extend(values) + return tensor.name + + +def convert_op_outputs(mace_op_def, tf_op): + mace_op_def.output_type.extend([tf_dtype_2_mace_dtype(output.dtype) + for output in tf_op.outputs]) + output_shapes = [] + for output in tf_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + mace_op_def.output_shape.extend(output_shapes) + + +def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops): + first_op = unresolved_ops[0] + print ('Op: ', first_op.name, first_op.type, first_op.outputs[0].shape) + + if first_op.name in resolved_ops: + pass + + elif first_op.type == 'Const': + print ('Add const node: ', first_op.name) + tf_tensor = first_op.outputs[0].eval() + tensor = net_def.tensors.add() + tensor.name = first_op.outputs[0].name + tensor.data_type = find_dtype(first_op.outputs[0].dtype) + shape = list(tf_tensor.shape) + if len(shape) > 0: + tensor.dims.extend(shape) + if first_op.outputs[0].dtype == tf.float32: + tensor.float_data.extend(tf_tensor.astype(float).flat) + elif first_op.outputs[0].dtype == tf.int32 or \ + first_op.outputs[0].dtype == tf.int8 or \ + first_op.outputs[0].dtype == tf.int16 or \ + first_op.outputs[0].dtype == tf.quint8 or \ + first_op.outputs[0].dtype == tf.quint16: + tensor.int32_data.extend(tf_tensor.astype(int).flat) + + else: + op_def = net_def.op.add() + op_def.name = first_op.name + op_def.type = dsp_ops.map_nn_op(first_op.type) + op_def.padding = padding_mode['NA'] + + if len(first_op.outputs) > 0 and first_op.type == 'Dequantize' \ + and len(first_op.outputs[0].consumers()) > 0 \ + and 
(first_op.outputs[0].consumers()[0].type == 'SpaceToBatchND' \ + or first_op.outputs[0].consumers()[0].type == 'BatchToSpaceND'): + input_tensor = first_op.inputs[0] + min_tensor = first_op.inputs[1] + max_tensor = first_op.inputs[2] + s2b_op = first_op.outputs[0].consumers()[0] + reshape_op = s2b_op.outputs[0].consumers()[0] + min_op = reshape_op.outputs[0].consumers()[0] + max_op = reshape_op.outputs[0].consumers()[1] + quantize_op = min_op.outputs[0].consumers()[0] + resolved_ops.add(s2b_op.name) + resolved_ops.add(reshape_op.name) + resolved_ops.add(min_op.name) + resolved_ops.add(max_op.name) + resolved_ops.add(quantize_op.name) + + op_def.name = quantize_op.name + op_def.type = dsp_ops.map_nn_op('Quantized' + s2b_op.type) + op_def.input.append(input_tensor.name) + op_def.input.extend([t.name for t in s2b_op.inputs[1:]]) + op_def.input.extend([min_tensor.name, max_tensor.name]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_op.outputs]) + convert_op_outputs(op_def, quantize_op) + elif len(first_op.outputs) > 0 and first_op.type == 'QuantizedReshape' \ + and len(first_op.outputs[0].consumers()) > 0 \ + and first_op.outputs[0].consumers()[0].type == 'Dequantize' \ + and len(first_op.outputs[0].consumers()[0].outputs[0].consumers()) > 0 \ + and first_op.outputs[0].consumers()[0].outputs[0].consumers()[0].type == 'Softmax': + input_tensor = first_op.inputs[0] + min_tensor = first_op.inputs[2] + max_tensor = first_op.inputs[3] + dequantize_op = first_op.outputs[0].consumers()[0] + softmax_op = dequantize_op.outputs[0].consumers()[0] + reshape_op = softmax_op.outputs[0].consumers()[0] + min_op = reshape_op.outputs[0].consumers()[0] + max_op = reshape_op.outputs[0].consumers()[1] + quantize_op = min_op.outputs[0].consumers()[0] + quantize_reshape_op = quantize_op.outputs[0].consumers()[0] + + resolved_ops.add(dequantize_op.name) + resolved_ops.add(softmax_op.name) + resolved_ops.add(reshape_op.name) + resolved_ops.add(min_op.name) + 
resolved_ops.add(max_op.name) + resolved_ops.add(quantize_op.name) + resolved_ops.add(quantize_reshape_op.name) + + op_def.name = quantize_reshape_op.name + op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax') + op_def.input.extend([input_tensor.name, min_tensor.name, max_tensor.name]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_reshape_op.outputs]) + convert_op_outputs(op_def, quantize_reshape_op) + elif has_padding_and_strides(first_op): + op_def.padding = padding_mode[first_op.get_attr('padding')] + op_def.input.extend([t.name for t in first_op.inputs]) + if 'ksize' in first_op.node_def.attr: + ksize = first_op.get_attr('ksize') + ksize_tensor = add_shape_const_node(net_def, first_op, ksize, 'ksize') + op_def.input.extend([ksize_tensor]) + strides = first_op.get_attr('strides') + strides_tensor = add_shape_const_node(net_def, first_op, strides, 'strides') + op_def.input.extend([strides_tensor]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs]) + convert_op_outputs(op_def, first_op) + elif is_node_flatten_reshape(first_op): + op_def.type = 'Flatten' + op_def.input.extend([t.name for t in first_op.inputs]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs]) + convert_op_outputs(op_def, first_op) + elif dsp_ops.has_op(first_op.type): + op_def.input.extend([t.name for t in first_op.inputs]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs]) + convert_op_outputs(op_def, first_op) + else: + raise Exception('Unsupported op: ', first_op) + + resolved_ops.add(first_op.name) + + del unresolved_ops[0] + +def add_output_node(net_def, output_node): + op_def = net_def.op.add() + op_def.name = '__output__' + op_def.type = 'OUTPUT' + op_def.input.extend([get_tensor_name_from_op(output_node, 0)]) + +def reverse_batch_to_space_and_biasadd(net_def): + tensor_map = {} + for tensor in net_def.tensors: + tensor_map[tensor.name] = tensor + op_map = {} + for 
op in net_def.op:  # (continuation of a `for` statement split at the chunk boundary)
        op_map[op.name] = op
    consumers = {}
    # Build a reverse index: tensor name -> list of ops that consume it.
    for op in net_def.op:
        for ipt in op.input:
            if ipt not in consumers:
                consumers[ipt] = []
            consumers[ipt].append(op)

    new_ops = []         # replacement ops to append to the rewritten graph
    skip_ops = set()     # names of original ops to drop from the rewritten graph
    visited_ops = set()  # ops already handled by the pattern rewrite below

    for op in net_def.op:
        if op.name in visited_ops:
            # NOTE(review): this branch only `pass`es, so already-visited ops
            # still fall through to the pattern match below and could be
            # rewritten twice; it looks like `continue` was intended --
            # confirm against upstream history before changing.
            pass
        # pattern: QConv -> RR -> R -> QB2S -> QBiasAdd -> RR -> R
        # NOTE(review): `success` is assigned but never read in the visible
        # chunk; presumably vestigial.
        success = False
        if op.type == 'Requantize_32to8':
            # Walk backwards from the requantize op to check for the
            # Conv -> Requantize -> BatchToSpaceND -> BiasAdd -> Requantize
            # chain that this pass reorders (BiasAdd is moved before B2S).
            biasadd_requantize_op = op
            biasadd_op = get_node_from_map(op_map, biasadd_requantize_op.input[0])
            if biasadd_op.type == 'QuantizedBiasAdd_8p8to32':
                b2s_op = get_node_from_map(op_map, biasadd_op.input[0])
                if b2s_op.type == 'QuantizedBatchToSpaceND_8':
                    conv_requantize_op = get_node_from_map(op_map, b2s_op.input[0])
                    conv_op = get_node_from_map(op_map, conv_requantize_op.input[0])
                    if conv_op.type == 'QuantizedConv2d_8x8to32':
                        # Re-wire BiasAdd to read directly from the conv's
                        # requantize outputs (data, min, max).
                        new_biasadd_op = mace_pb2.OperatorDef()
                        new_biasadd_op.CopyFrom(biasadd_op)
                        new_biasadd_op.input[0] = get_tensor_name_from_op(conv_requantize_op.name, 0)
                        new_biasadd_op.input[2] = get_tensor_name_from_op(conv_requantize_op.name, 1)
                        new_biasadd_op.input[3] = get_tensor_name_from_op(conv_requantize_op.name, 2)
                        # 8p8to32 output is 32-bit, so presumably 4x the 8-bit
                        # requantized byte size -- hence the *4 / /4 pair below.
                        new_biasadd_op.out_max_byte_size[0] = conv_requantize_op.out_max_byte_size[0] * 4

                        new_biasadd_requantize_op = mace_pb2.OperatorDef()
                        new_biasadd_requantize_op.CopyFrom(biasadd_requantize_op)
                        new_biasadd_requantize_op.out_max_byte_size[0] = new_biasadd_op.out_max_byte_size[0] / 4

                        # BatchToSpaceND now consumes the (moved) biasadd's
                        # requantized outputs instead of feeding the biasadd.
                        new_b2s_op = mace_pb2.OperatorDef()
                        new_b2s_op.CopyFrom(b2s_op)
                        new_b2s_op.input[0] = get_tensor_name_from_op(biasadd_requantize_op.name, 0)
                        new_b2s_op.input[3] = get_tensor_name_from_op(biasadd_requantize_op.name, 1)
                        new_b2s_op.input[4] = get_tensor_name_from_op(biasadd_requantize_op.name, 2)

                        new_ops.extend([new_biasadd_op, new_biasadd_requantize_op, new_b2s_op])
                        skip_ops = skip_ops.union([biasadd_op.name, biasadd_requantize_op.name, b2s_op.name])
                        visited_ops.add(op.name)

                        # Downstream consumers must be re-pointed at the moved
                        # B2S outputs; assignment RHS continues in the next chunk.
                        follow_ops =
consumers[get_tensor_name_from_op(biasadd_requantize_op.name, 0)] + for follow_op in follow_ops: + new_follow_op = mace_pb2.OperatorDef() + new_follow_op.CopyFrom(follow_op) + for i in xrange(len(follow_op.input)): + for k in xrange(3): + if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k): + new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k) + new_ops.append(new_follow_op) + skip_ops.add(follow_op.name) + visited_ops.add(follow_op.name) + + visited_ops.add(op.name) + + new_net_def = mace_pb2.NetDef() + new_net_def.tensors.extend(tensor_map.values()) + new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops]) + new_net_def.op.extend(new_ops) + + return new_net_def + +def add_node_id(net_def): + node_id_counter = 0 + node_id_map = {} + for tensor in net_def.tensors: + tensor.node_id = node_id_counter + node_id_counter += 1 + tensor_op, port = get_op_and_port_from_tensor(tensor.name) + node_id_map[tensor_op] = tensor.node_id + + for op in net_def.op: + op.node_id = node_id_counter + node_id_counter += 1 + node_id_map[op.name] = op.node_id + for ipt in op.input: + op_name, port = get_op_and_port_from_tensor(ipt) + node_id = node_id_map[op_name] + node_input = op.node_input.add() + node_input.node_id = node_id + node_input.output_port = int(port) + + return net_def + +def add_input_output_info(net_def, input_node, output_node, graph, dtype): + input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0)) + output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0)) + + input_info = net_def.input_info.add() + input_info.dims.extend(input_tensor.shape.as_list()) + input_info.data_type = dtype + if dtype == mace_pb2.DT_UINT8: + for i in xrange(2): + input_info = net_def.input_info.add() + input_info.dims.extend([1,1,1,1]) + input_info.data_type = mace_pb2.DT_FLOAT + + output_info = net_def.output_info.add() + output_info.dims.extend(output_tensor.shape.as_list()) + 
output_info.data_type = dtype + if dtype == mace_pb2.DT_UINT8: + for i in xrange(2): + output_info = net_def.output_info.add() + output_info.dims.extend([1,1,1,1]) + output_info.data_type = mace_pb2.DT_FLOAT + + return net_def + +def fuse_quantize(net_def, input_node, output_node): + tensor_map = {} + for tensor in net_def.tensors: + tensor_map[tensor.name] = tensor + op_map = {} + for op in net_def.op: + op_map[op.name] = op + consumers = {} + for op in net_def.op: + for ipt in op.input: + if ipt not in consumers: + consumers[ipt] = [] + consumers[ipt].append(op) + + skip_ops = set() + new_ops = [] + skip_tensors = set() + + # INPUT->Flatten->Minf, Maxf->Quantize + for op in net_def.op: + if op.type == 'INPUT': + input_op = op + flatten_op = None + quantize_op = None + for o in consumers[get_tensor_name_from_op(input_op.name, 0)]: + if o.type == 'Flatten': + flatten_op = o + elif o.type == 'Quantize': + quantize_op = o + if quantize_op is not None: + minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)] + skip_ops = skip_ops.union([flatten_op.name, minf_op.name, maxf_op.name]) + skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]]) + quantize_op.type = 'AutoQuantize' + del quantize_op.input[1:] + + new_net_def = mace_pb2.NetDef() + new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors]) + new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops]) + new_net_def.op.extend(new_ops) + return new_net_def + +def convert_to_mace_pb(model_file, input_node, output_node, dsp_mode): + """ + nnlib does not have batch norm, so use tensorflow optimizer to fold + batch norm with convolution. The fold optimization reorders ops, so + we sort ops first by topology. 
+ """ + input_graph_def = tf.GraphDef() + with gfile.Open(model_file, "rb") as f: + data = f.read() + input_graph_def.ParseFromString(data) + + input_graph_def = graph_util.sort_tf_graph(input_graph_def) + net_def = mace_pb2.NetDef() + + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(input_graph_def, name="") + ops = graph.get_operations() + dsp_ops = DspOps() + resolved_ops = set() + # convert const node + unresolved_ops = [op for op in ops if op.type == 'Const'] + while len(unresolved_ops) > 0: + convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops) + + # convert op node + unresolved_ops = [op for op in ops if op.type != 'Const'] + while len(unresolved_ops) > 0: + convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops) + + add_output_node(net_def, output_node) + net_def = reverse_batch_to_space_and_biasadd(net_def) + net_def = fuse_quantize(net_def, input_node, output_node) + + sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__') + net_def_with_node_id = add_node_id(sorted_net_def) + + dtype = mace_pb2.DT_FLOAT + final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype) + + arg = final_net_def.arg.add() + arg.name = 'dsp_mode' + arg.i = dsp_mode + + return final_net_def + diff --git a/tools/benchmark.sh b/tools/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..c327825546af73ed8cc8214a94b94b5d82cb46b0 --- /dev/null +++ b/tools/benchmark.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +Usage() { + echo "Usage: bash tools/benchmark.sh model_output_dir" +} + +if [ $# -lt 1 ]; then + Usage + exit 1 +fi + +CURRENT_DIR=`dirname $0` +source ${CURRENT_DIR}/env.sh + +MODEL_OUTPUT_DIR=$1 + +if [ -f "$MODEL_OUTPUT_DIR/benchmark_model" ]; then + rm -rf $MODEL_OUTPUT_DIR/benchmark_model +fi + +if [ "$EMBED_MODEL_DATA" = 0 ]; then + cp codegen/models/${MODEL_TAG}/${MODEL_TAG}.data $MODEL_OUTPUT_DIR +fi + +if [ 
x"$TARGET_ABI" == x"host" ]; then + bazel build --verbose_failures -c opt --strip always benchmark:benchmark_model \ + --copt="-std=c++11" \ + --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ + --copt="-Werror=return-type" \ + --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \ + --copt="-O3" \ + --define openmp=true \ + --define production=true || exit 1 + + cp bazel-bin/benchmark/benchmark_model $MODEL_OUTPUT_DIR + + MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \ + ${MODEL_OUTPUT_DIR}/benchmark_model \ + --model_data_file=${PHONE_DATA_DIR}/${MODEL_TAG}.data \ + --device=${DEVICE_TYPE} \ + --input_node="${INPUT_NODES}" \ + --input_shape="${INPUT_SHAPES}"\ + --output_node="${OUTPUT_NODES}" \ + --output_shape="${OUTPUT_SHAPES}"\ + --input_file=${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME} || exit 1 + +else + bazel build --verbose_failures -c opt --strip always benchmark:benchmark_model \ + --crosstool_top=//external:android/crosstool \ + --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ + --cpu=${TARGET_ABI} \ + --copt="-std=c++11" \ + --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ + --copt="-Werror=return-type" \ + --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \ + --copt="-O3" \ + --define openmp=true \ + --define production=true || exit 1 + + cp bazel-bin/benchmark/benchmark_model $MODEL_OUTPUT_DIR + + adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1 + IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}" + for NAME in "${INPUT_NAMES[@]}";do + FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME}) + adb push ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME} ${PHONE_DATA_DIR} || exit 1 + done + adb push ${MODEL_OUTPUT_DIR}/benchmark_model ${PHONE_DATA_DIR} || exit 1 + if [ "$EMBED_MODEL_DATA" = 0 ]; then + adb push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data ${PHONE_DATA_DIR} || exit 1 + fi + + adb /dev/null + adb pull ${PHONE_DATA_DIR}/mace_run.config ${CL_BIN_DIR} > /dev/null + fi +fi + +python mace/python/tools/opencl_codegen.py \ + --cl_binary_dirs=${CL_BIN_DIRS} \ + 
--output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc + +python mace/python/tools/binary_codegen.py \ + --binary_dirs=${CL_BIN_DIRS} \ + --binary_file_name=mace_run.config \ + --output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc diff --git a/tools/mace_tools.py b/tools/mace_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..556d77412dc92df2fd0a673c59b7fafcfe0dc90e --- /dev/null +++ b/tools/mace_tools.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python + +# Must run at root dir of libmace project. +# python tools/mace_tools.py \ +# --config=tools/example.yaml \ +# --round=100 \ +# --mode=all + +import argparse +import hashlib +import os +import shutil +import subprocess +import sys +import urllib +import yaml + +from ConfigParser import ConfigParser + +def run_command(command): + print("Run command: {}".format(command)) + result = subprocess.Popen( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = result.communicate() + + if out: + print("Stdout msg:\n{}".format(out)) + if err: + print("Stderr msg:\n{}".format(err)) + + if result.returncode != 0: + raise Exception("Exit not 0 from bash with code: {}, command: {}".format( + result.returncode, command)) + + +def get_global_runtime(configs): + runtime_list = [] + for model_name in configs["models"]: + model_runtime = configs["models"][model_name]["runtime"] + runtime_list.append(model_runtime.lower()) + + global_runtime = "" + if "dsp" in runtime_list: + global_runtime = "dsp" + elif "gpu" in runtime_list: + global_runtime = "gpu" + elif "cpu" in runtime_list: + global_runtime = "cpu" + else: + raise Exception("Not found available RUNTIME in config files!") + + return global_runtime + + +def generate_opencl_and_version_code(): + command = "bash tools/generate_opencl_and_version_code.sh" + run_command(command) + + +def clear_env(): + command = "bash tools/clear_env.sh" + run_command(command) + + +def generate_random_input(model_output_dir): + generate_data_or_not 
= True + command = "bash tools/validate_tools.sh {} {}".format( + model_output_dir, int(generate_data_or_not)) + run_command(command) + + +def generate_model_code(): + command = "bash tools/generate_model_code.sh" + run_command(command) + + +def build_mace_run(production_mode, model_output_dir, hexagon_mode): + command = "bash tools/build_mace_run.sh {} {} {}".format( + int(production_mode), model_output_dir, int(hexagon_mode)) + run_command(command) + + +def tuning_run(model_output_dir, running_round, tuning, production_mode, restart_round): + command = "bash tools/tuning_run.sh {} {} {} {} {}".format( + model_output_dir, running_round, int(tuning), int(production_mode), restart_round) + run_command(command) + + +def benchmark_model(model_output_dir): + command = "bash tools/benchmark.sh {}".format(model_output_dir) + run_command(command) + + +def run_model(model_output_dir, running_round, restart_round): + tuning_run(model_output_dir, running_round, False, False, restart_round) + + +def generate_production_code(model_output_dirs, pull_or_not): + cl_bin_dirs = [] + for d in model_output_dirs: + cl_bin_dirs.append(os.path.join(d, "opencl_bin")) + cl_bin_dirs_str = ",".join(cl_bin_dirs) + command = "bash tools/generate_production_code.sh {} {}".format( + cl_bin_dirs_str, int(pull_or_not)) + run_command(command) + + +def build_mace_run_prod(model_output_dir, tuning, global_runtime): + if "dsp" == global_runtime: + hexagon_mode = True + else: + hexagon_mode = False + + production_or_not = False + build_mace_run(production_or_not, model_output_dir, hexagon_mode) + tuning_run( + model_output_dir, + running_round=0, + tuning=tuning, + production_mode=production_or_not, + restart_round=1) + + production_or_not = True + pull_or_not = True + generate_production_code([model_output_dir], pull_or_not) + build_mace_run(production_or_not, model_output_dir, hexagon_mode) + + +def build_run_throughput_test(run_seconds, merged_lib_file, model_input_dir): + command = "bash 
# NOTE(review): this chunk is a whitespace-mangled git-diff paste; the line
# structure and Python indentation below are reconstructed.  The code is
# Python 2 (print statements, urllib.urlretrieve) -- run with python2.
# Tail of build_run_throughput_test(); its "def" line lies before this chunk.
tools/build_run_throughput_test.sh {} {} {}".format(
        run_seconds, merged_lib_file, model_input_dir)
    run_command(command)


def validate_model(model_output_dir):
    # Compare mace output against the reference framework by shelling out to
    # tools/validate_tools.sh.  generate_data_or_not=False: reuse the input
    # data generated earlier instead of creating fresh random inputs.
    generate_data_or_not = False
    command = "bash tools/validate_tools.sh {} {}".format(
        model_output_dir, int(generate_data_or_not))
    run_command(command)


def build_production_code():
    # Build the production libraries via the helper script.
    command = "bash tools/build_production_code.sh"
    run_command(command)


def merge_libs_and_tuning_results(output_dir, model_output_dirs):
    # Regenerate production code (pull_or_not=False: do not pull tuning
    # results from the device), rebuild it, then merge all per-model static
    # libraries into one archive via tools/merge_libs.sh.
    pull_or_not = False
    generate_production_code(model_output_dirs, pull_or_not)
    build_production_code()

    # merge_libs.sh expects the model output dirs as one comma-separated arg.
    model_output_dirs_str = ",".join(model_output_dirs)
    command = "bash tools/merge_libs.sh {} {}".format(output_dir,
                                                      model_output_dirs_str)
    run_command(command)


def parse_model_configs():
    # Load the YAML model-config file named by --config.
    # NOTE(review): yaml.load() without an explicit Loader can construct
    # arbitrary Python objects; prefer yaml.safe_load() if the config file is
    # not fully trusted -- left unchanged here (comment-only review).
    with open(FLAGS.config) as f:
        configs = yaml.load(f)
        return configs


def parse_args():
    """Parses command line arguments."""
    parser = argparse.ArgumentParser()
    # Registers a "bool" type so --tuning=true/false parses as a boolean.
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument(
        "--config",
        type=str,
        default="./tool/config",
        help="The global config file of models.")
    parser.add_argument(
        "--output_dir", type=str, default="build", help="The output dir.")
    parser.add_argument(
        "--round", type=int, default=1, help="The model running round.")
    parser.add_argument("--run_seconds", type=int, default=10,
                        help="The model throughput test running seconds.")
    parser.add_argument(
        "--restart_round", type=int, default=1, help="The model restart round.")
    parser.add_argument(
        "--tuning", type="bool", default="true", help="Tune opencl params.")
    parser.add_argument("--mode", type=str, default="all",
                        help="[build|run|validate|merge|all|throughput_test].")
    return parser.parse_known_args()


def main(unused_args):
    # Drive the whole build/run/validate/merge pipeline.  Sub-steps are shell
    # scripts that receive their parameters through environment variables.
    configs = parse_model_configs()

    if FLAGS.mode == "build" or FLAGS.mode == "all":
        # Remove previous output dirs
        if not os.path.exists(FLAGS.output_dir):
            os.makedirs(FLAGS.output_dir)
        elif os.path.exists(os.path.join(FLAGS.output_dir, "libmace")):
            shutil.rmtree(os.path.join(FLAGS.output_dir, "libmace"))

    if FLAGS.mode == "validate":
        # Validation only needs a single round / single restart.
        FLAGS.round = 1
        FLAGS.restart_round = 1

    os.environ["EMBED_MODEL_DATA"] = str(configs["embed_model_data"])
    os.environ["VLOG_LEVEL"] = str(configs["vlog_level"])
    # Project name is the config file's basename without extension.
    os.environ["PROJECT_NAME"] = os.path.splitext(os.path.basename(FLAGS.config))[0]

    generate_opencl_and_version_code()

    for target_abi in configs["target_abis"]:
        global_runtime = get_global_runtime(configs)
        # Transfer params by environment
        os.environ["TARGET_ABI"] = target_abi
        model_output_dirs = []
        for model_name in configs["models"]:
            # Transfer params by environment
            os.environ["MODEL_TAG"] = model_name
            print '=======================', model_name, '======================='
            model_config = configs["models"][model_name]
            for key in model_config:
                # List-valued entries are flattened: node names joined with
                # ",", shapes joined with ":"; everything else stringified.
                if key in ['input_nodes', 'output_nodes'] and isinstance(model_config[key], list):
                    os.environ[key.upper()] = ",".join(model_config[key])
                elif key in ['input_shapes', 'output_shapes'] and isinstance(model_config[key], list):
                    os.environ[key.upper()] = ":".join(model_config[key])
                else:
                    os.environ[key.upper()] = str(model_config[key])

            # Hash the model path so different model sources get distinct
            # output directories: <output>/<model>/<md5(path)>/<abi>.
            md5 = hashlib.md5()
            md5.update(model_config["model_file_path"])
            model_path_digest = md5.hexdigest()
            model_output_dir = "%s/%s/%s/%s" % (FLAGS.output_dir, model_name, model_path_digest, target_abi)
            model_output_dirs.append(model_output_dir)

            if FLAGS.mode == "build" or FLAGS.mode == "all":
                if os.path.exists(model_output_dir):
                    shutil.rmtree(model_output_dir)
                os.makedirs(model_output_dir)
                clear_env()

                # Support http:// and https://
                # NOTE(review): nesting of the download block under the build
                # branch is inferred -- indentation was lost in this paste.
                if model_config["model_file_path"].startswith(
                        "http://") or model_config["model_file_path"].startswith("https://"):
                    os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb"
                    urllib.urlretrieve(model_config["model_file_path"],
                                       os.environ["MODEL_FILE_PATH"])

                if model_config["platform"] == "caffe" and (model_config["weight_file_path"].startswith(
                        "http://") or model_config["weight_file_path"].startswith("https://")):
                    os.environ["WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel"
                    urllib.urlretrieve(model_config["weight_file_path"], os.environ["WEIGHT_FILE_PATH"])

            if FLAGS.mode == "build" or FLAGS.mode == "run" or FLAGS.mode == "validate"\
                    or FLAGS.mode == "benchmark" or FLAGS.mode == "all":
                generate_random_input(model_output_dir)

            if FLAGS.mode == "build" or FLAGS.mode == "all":
                generate_model_code()
                build_mace_run_prod(model_output_dir, FLAGS.tuning, global_runtime)

            if FLAGS.mode == "run" or FLAGS.mode == "validate" or FLAGS.mode == "all":
                run_model(model_output_dir, FLAGS.round, FLAGS.restart_round)

            if FLAGS.mode == "benchmark":
                benchmark_model(model_output_dir)

            if FLAGS.mode == "validate" or FLAGS.mode == "all":
                validate_model(model_output_dir)

        if FLAGS.mode == "build" or FLAGS.mode == "merge" or FLAGS.mode == "all":
            # Per-ABI merge of all model libs and tuning results.
            merge_libs_and_tuning_results(FLAGS.output_dir + "/" + target_abi,
                                          model_output_dirs)

    if FLAGS.mode == "throughput_test":
        # Throughput test links against the merged lib of the FIRST abi only.
        merged_lib_file = FLAGS.output_dir + "/%s/libmace/lib/libmace_%s.a" % \
            (configs["target_abis"][0], os.environ["PROJECT_NAME"])
        generate_random_input(FLAGS.output_dir)
        for model_name in configs["models"]:
            runtime = configs["models"][model_name]["runtime"]
            # e.g. CPU_MODEL_TAG / GPU_MODEL_TAG / DSP_MODEL_TAG.
            os.environ["%s_MODEL_TAG" % runtime.upper()] = model_name
        build_run_throughput_test(FLAGS.run_seconds, merged_lib_file, FLAGS.output_dir)


if __name__ == "__main__":
    FLAGS, unparsed = parse_args()
    main(unused_args=[sys.argv[0]] + unparsed)

diff --git a/tools/merge_libs.sh b/tools/merge_libs.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c872e8617436df12f4968936f4e9e163958de94f
--- /dev/null
+++ b/tools/merge_libs.sh
#!/bin/bash

Usage() {
  echo "Usage: bash
tools/merge_libs.sh libmace_output_dir model_output_dirs"
}

if [ $# -lt 2 ]; then
  Usage
  exit 1
fi

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

LIBMACE_BUILD_DIR=$1
MODEL_OUTPUT_DIRS=$2
# Comma-separated list -> bash array.
MODEL_OUTPUT_DIRS_ARR=(${MODEL_OUTPUT_DIRS//,/ })
MODEL_HEADER_DIR=${LIBMACE_BUILD_DIR}/libmace/include/mace/public
MODEL_DATA_DIR=${LIBMACE_BUILD_DIR}/libmace/data

# Recreate the libmace output layout from scratch and stage public headers
# plus the hexagon controller shared lib.
rm -rf ${LIBMACE_BUILD_DIR}/libmace
mkdir -p ${LIBMACE_BUILD_DIR}/libmace/include/mace/public
mkdir -p ${LIBMACE_BUILD_DIR}/libmace/lib
mkdir -p ${MODEL_DATA_DIR}
cp -rf ${MACE_SOURCE_DIR}/mace/public/*.h ${LIBMACE_BUILD_DIR}/libmace/include/mace/public/
cp ${MACE_SOURCE_DIR}/mace/core/runtime/hexagon/libhexagon_controller.so ${LIBMACE_BUILD_DIR}/libmace/lib

LIBMACE_TEMP_DIR=`mktemp -d -t libmace.XXXX`

# Merge all libraries in to one
# Build an "ar -M" MRI script that lists every static lib to fold in.
echo "create ${LIBMACE_BUILD_DIR}/libmace/lib/libmace_${PROJECT_NAME}.a" > ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri

if [ x"$TARGET_ABI" = x"host" ]; then
  # Host builds use the position-independent (.pic.a) variants.
  echo "addlib bazel-bin/mace/codegen/libgenerated_opencl_prod.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/codegen/libgenerated_tuning_params.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
else
  echo "addlib bazel-bin/mace/codegen/libgenerated_opencl_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/codegen/libgenerated_tuning_params.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/codegen/libgenerated_version.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/core/libcore.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/core/libopencl_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/kernels/libkernels.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/utils/libutils.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  echo "addlib bazel-bin/mace/utils/libutils_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
fi

# Add every per-model archive, and stage model data files and headers.
for model_output_dir in ${MODEL_OUTPUT_DIRS_ARR[@]}; do
  for lib in ${model_output_dir}/*.a; do
    echo "addlib ${lib}" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
  done
  for data_file in ${model_output_dir}/*.data; do
    cp ${data_file} ${MODEL_DATA_DIR}
  done
  for header_file in ${model_output_dir}/*.h; do
    cp ${header_file} ${MODEL_HEADER_DIR}
  done
done
echo "save" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
echo "end" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
# NOTE(review): always uses the aarch64 NDK ar, even when TARGET_ABI is
# another ABI or host -- confirm this is intended.
$ANDROID_NDK_HOME/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-ar \
  -M < ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri || exit 1

rm -rf ${LIBMACE_TEMP_DIR}

echo "Libs merged!"
diff --git a/tools/tuning_run.sh b/tools/tuning_run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e8a0f8bc54363b8ba3d02fd18efe4b110263791b
--- /dev/null
+++ b/tools/tuning_run.sh
#!/bin/bash

Usage() {
  echo "Usage: bash tools/tuning_run.sh model_output_dir round tuning production_mode"
}

if [ $# -lt 4 ]; then
  Usage
  exit 1
fi

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

MODEL_OUTPUT_DIR=$1
ROUND=$2
TUNING_OR_NOT=$3
PRODUCTION_MODE=$4
# NOTE(review): a fifth positional arg is read here, but Usage and the
# "$# -lt 4" check above only cover four -- confirm callers always pass it.
RESTART_ROUND=$5

if [ x"$TARGET_ABI" = x"host" ]; then
  # Host target: run mace_run locally instead of on a device.
  MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
  ${MODEL_OUTPUT_DIR}/mace_run \
    --input_node="${INPUT_NODES}" \
    --input_shape="${INPUT_SHAPES}"\
    --output_node="${OUTPUT_NODES}" \
    --output_shape="${OUTPUT_SHAPES}"\
    --input_file=${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME} \
    --output_file=${MODEL_OUTPUT_DIR}/${OUTPUT_FILE_NAME} \
    --model_data_file=${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data \
    --device=${DEVICE_TYPE} \
    --round=1 \
    --restart_round=1 || exit 1
else
  # Tune only when requested and not building for production.
  if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != 1 ]];then
    tuning_flag=1
  else
    tuning_flag=0
  fi

  adb
shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1
  if [ "$PRODUCTION_MODE" = 0 ]; then
    adb shell "mkdir -p ${KERNEL_DIR}" || exit 1
  fi

  # Push one generated input file per input node.  Node names may contain
  # characters like ":" or "/", so map every non-alnum char to "_".
  IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}"
  for NAME in "${INPUT_NAMES[@]}";do
    FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
    adb push ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME} ${PHONE_DATA_DIR} > /dev/null || exit 1
  done

  # NOTE(review): the next line is corrupted in this paste -- an
  # "adb push <something> ${PHONE_DATA_DIR} >" prefix appears to have been
  # lost (by analogy with the surrounding pushes, plausibly the mace_run
  # binary).  TODO: recover the original line from git history.
  adb /dev/null || exit 1
  if [ "$EMBED_MODEL_DATA" = 0 ]; then
    # Model weights are a separate .data file only when not embedded.
    adb push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data ${PHONE_DATA_DIR} > /dev/null || exit 1
  fi
  adb push mace/core/runtime/hexagon/libhexagon_controller.so ${PHONE_DATA_DIR} > /dev/null || exit 1

  # NOTE(review): the paste is hard-truncated here.  Everything from this
  # "mace_adb_output=`adb ..." command through the head of tools/validate.py
  # (its diff header, imports, load_data(), and most of compare_output()) is
  # missing; the text resumes mid-way through compare_output()'s
  # per-runtime similarity thresholds.  Do not apply this hunk as-is.
  mace_adb_output=`adb 0.999) or \
        (FLAGS.mace_runtime == "gpu" and similarity > 0.995) or \
        (FLAGS.mace_runtime == "dsp" and similarity > 0.930):
      print '=======================Similarity Test Passed======================'
    else:
      print '=======================Similarity Test Failed======================'
      sys.exit(-1)
  else:
    print '=======================Skip empty node==================='
    sys.exit(-1)


def validate_tf_model(input_names, input_shapes, output_names):
    # Run the reference TensorFlow graph on the generated inputs and compare
    # each output tensor against the corresponding mace output file.
    import tensorflow as tf
    if not os.path.isfile(FLAGS.model_file):
        print("Input graph file '" + FLAGS.model_file + "' does not exist!")
        sys.exit(-1)

    # NOTE(review): nesting below is inferred -- indentation was lost.
    input_graph_def = tf.GraphDef()
    with open(FLAGS.model_file, "rb") as f:
        data = f.read()
        input_graph_def.ParseFromString(data)
        tf.import_graph_def(input_graph_def, name="")

    with tf.Session() as session:
        with session.graph.as_default() as graph:
            tf.import_graph_def(input_graph_def, name="")
            input_dict = {}
            for i in range(len(input_names)):
                # One input file per node, reshaped to the declared shape.
                input_value = load_data(FLAGS.input_file + "_" + input_names[i])
                input_value = input_value.reshape(input_shapes[i])
                input_node = graph.get_tensor_by_name(input_names[i] + ':0')
                input_dict[input_node] = input_value

            output_nodes = []
            for name in output_names:
                output_nodes.extend([graph.get_tensor_by_name(name +
':0')])
            output_values = session.run(output_nodes, feed_dict=input_dict)
            for i in range(len(output_names)):
                output_file_name = FLAGS.mace_out_file + "_" + format_output_name(output_names[i])
                mace_out_value = load_data(output_file_name)
                compare_output(output_names[i], mace_out_value, output_values[i])

def validate_caffe_model(input_names, input_shapes, output_names, output_shapes):
    # Run the reference Caffe model on the generated inputs and compare each
    # output blob against the corresponding mace output file.
    os.environ['GLOG_minloglevel'] = '1' # suprress Caffe verbose prints
    import caffe
    if not os.path.isfile(FLAGS.model_file):
        print("Input graph file '" + FLAGS.model_file + "' does not exist!")
        sys.exit(-1)
    if not os.path.isfile(FLAGS.weight_file):
        print("Input weight file '" + FLAGS.weight_file + "' does not exist!")
        sys.exit(-1)

    caffe.set_mode_cpu()

    net = caffe.Net(FLAGS.model_file, caffe.TEST, weights=FLAGS.weight_file)

    for i in range(len(input_names)):
        input_value = load_data(FLAGS.input_file + "_" + input_names[i])
        # NHWC (mace layout) -> NCHW (caffe layout).
        input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1, 2))
        net.blobs[input_names[i]].data[0] = input_value

    net.forward()

    for i in range(len(output_names)):
        value = net.blobs[net.top_names[output_names[i]][0]].data[0]
        out_shape = output_shapes[i]
        # Rotate the declared NHWC shape to NCHW for reshaping the raw blob,
        # then transpose the data back to NHWC to match the mace output.
        out_shape[1], out_shape[2], out_shape[3] = out_shape[3], out_shape[1], out_shape[2]
        value = value.reshape(out_shape).transpose((0, 2, 3, 1))
        output_file_name = FLAGS.mace_out_file + "_" + format_output_name(output_names[i])
        mace_out_value = load_data(output_file_name)
        compare_output(output_names[i], mace_out_value, value)

def main(unused_args):
    # Node names are comma-separated; shapes are colon-separated groups of
    # comma-separated ints (one group per node).
    input_names = [name for name in FLAGS.input_node.split(',')]
    input_shape_strs = [shape for shape in FLAGS.input_shape.split(':')]
    input_shapes = [[int(x) for x in shape.split(',')] for shape in input_shape_strs]
    output_names = [name for name in FLAGS.output_node.split(',')]
    assert len(input_names) == len(input_shapes)

    if FLAGS.platform == 'tensorflow':
        validate_tf_model(input_names, input_shapes, output_names)
    elif FLAGS.platform == 'caffe':
        # Caffe additionally needs output shapes for the NCHW<->NHWC dance.
        output_shape_strs = [shape for shape in FLAGS.output_shape.split(':')]
        output_shapes = [[int(x) for x in shape.split(',')] for shape in output_shape_strs]
        validate_caffe_model(input_names, input_shapes, output_names, output_shapes)

def parse_args():
    """Parses command line arguments."""
    parser = argparse.ArgumentParser()
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument(
        "--platform",
        type=str,
        default="",
        help="Tensorflow or Caffe.")
    parser.add_argument(
        "--model_file",
        type=str,
        default="",
        help="TensorFlow or Caffe \'GraphDef\' file to load.")
    parser.add_argument(
        "--weight_file",
        type=str,
        default="",
        help="caffe model file to load.")
    parser.add_argument(
        "--input_file",
        type=str,
        default="",
        help="input file.")
    parser.add_argument(
        "--mace_out_file",
        type=str,
        default="",
        help="mace output file to load.")
    parser.add_argument(
        "--mace_runtime",
        type=str,
        default="gpu",
        help="mace runtime device.")
    parser.add_argument(
        "--input_shape",
        type=str,
        default="1,64,64,3",
        help="input shape.")
    parser.add_argument(
        "--output_shape",
        type=str,
        default="1,64,64,2",
        help="output shape.")
    parser.add_argument(
        "--input_node",
        type=str,
        default="input_node",
        help="input node")
    parser.add_argument(
        "--output_node",
        type=str,
        default="output_node",
        help="output node")

    return parser.parse_known_args()


if __name__ == '__main__':
    FLAGS, unparsed = parse_args()
    main(unused_args=[sys.argv[0]] + unparsed)

diff --git a/tools/validate_tools.sh b/tools/validate_tools.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6a458676dcaced33e878da197acc545123e0aeb8
--- /dev/null
+++ b/tools/validate_tools.sh
#!/bin/bash

Usage() {
  echo "Usage: bash tools/validate_tools.sh model_output_dir generate_data_or_not"
}

if [ $# -lt 2
]; then
  Usage
  exit 1
fi

CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh

MODEL_OUTPUT_DIR=$1
GENERATE_DATA_OR_NOT=$2

# Node lists arrive comma-separated via environment variables.
IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}"
IFS=',' read -r -a OUTPUT_NAMES <<< "${OUTPUT_NODES}"

echo $MODEL_OUTPUT_DIR
# Mode 1: only (re)generate random input data for each input node, then exit.
if [ "$GENERATE_DATA_OR_NOT" = 1 ]; then
  for NAME in "${INPUT_NAMES[@]}";do
    # Map non-alphanumeric chars in node names to "_" for file names.
    FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
    rm -rf ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME}
  done
  python -u tools/generate_data.py --input_node=${INPUT_NODES} \
    --input_file=${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME} \
    --input_shape="${INPUT_SHAPES}" || exit 1
  exit 0
fi

if [ "$PLATFORM" == "tensorflow" ];then
  # Pull the mace outputs from the device first (unless running on host).
  if [[ x"$TARGET_ABI" != x"host" ]]; then
    for NAME in "${OUTPUT_NAMES[@]}";do
      FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
      rm -rf ${MODEL_OUTPUT_DIR}/${OUTPUT_FILE_NAME}_${FORMATTED_NAME}
      adb pull ${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME}_${FORMATTED_NAME} ${MODEL_OUTPUT_DIR} > /dev/null
    done
  fi
  python -u tools/validate.py --platform=tensorflow \
    --model_file ${MODEL_FILE_PATH} \
    --input_file ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME} \
    --mace_out_file ${MODEL_OUTPUT_DIR}/${OUTPUT_FILE_NAME} \
    --mace_runtime ${RUNTIME} \
    --input_node ${INPUT_NODES} \
    --output_node ${OUTPUT_NODES} \
    --input_shape ${INPUT_SHAPES} \
    --output_shape ${OUTPUT_SHAPES} || exit 1

elif [ "$PLATFORM" == "caffe" ];then
  # Caffe validation runs inside a docker container built from docker/caffe.
  IMAGE_NAME=mace-caffe:latest
  CONTAINER_NAME=mace_caffe_validator
  RES_FILE=validation.result

  # Build the image once, create the container once, start it if stopped.
  if [[ "$(docker images -q mace-caffe:latest 2> /dev/null)" == "" ]]; then
    echo "Build caffe docker"
    docker build -t ${IMAGE_NAME} docker/caffe || exit 1
  fi

  if [ ! "$(docker ps -qa -f name=${CONTAINER_NAME})" ]; then
    echo "Run caffe container"
    docker run -d -it --name ${CONTAINER_NAME} ${IMAGE_NAME} /bin/bash || exit 1
  fi

  if [ "$(docker inspect -f {{.State.Running}} ${CONTAINER_NAME})" == "false" ];then
    echo "Start caffe container"
    docker start ${CONTAINER_NAME}
  fi

  # Copy generated inputs into the container.
  for NAME in "${INPUT_NAMES[@]}";do
    FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
    docker cp ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME} ${CONTAINER_NAME}:/mace
  done

  # Pull mace outputs from the device (unless host), then copy them in too.
  if [[ x"$TARGET_ABI" != x"host" ]]; then
    for NAME in "${OUTPUT_NAMES[@]}";do
      FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
      rm -rf ${MODEL_OUTPUT_DIR}/${OUTPUT_FILE_NAME}_${FORMATTED_NAME}
      adb pull ${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME}_${FORMATTED_NAME} ${MODEL_OUTPUT_DIR} > /dev/null
    done
  fi
  for NAME in "${OUTPUT_NAMES[@]}";do
    FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
    docker cp ${MODEL_OUTPUT_DIR}/${OUTPUT_FILE_NAME}_${FORMATTED_NAME} ${CONTAINER_NAME}:/mace
  done

  # Ship the model, weights and validate.py, then run the comparison inside
  # the container (exit status propagates via "|| exit 1").
  MODEL_FILE_NAME=$(basename ${MODEL_FILE_PATH})
  WEIGHT_FILE_NAME=$(basename ${WEIGHT_FILE_PATH})
  docker cp tools/validate.py ${CONTAINER_NAME}:/mace
  docker cp ${MODEL_FILE_PATH} ${CONTAINER_NAME}:/mace
  docker cp ${WEIGHT_FILE_PATH} ${CONTAINER_NAME}:/mace
  docker exec -it ${CONTAINER_NAME} python -u /mace/validate.py \
    --platform=caffe \
    --model_file /mace/${MODEL_FILE_NAME} \
    --weight_file /mace/${WEIGHT_FILE_NAME} \
    --input_file /mace/${INPUT_FILE_NAME} \
    --mace_out_file /mace/${OUTPUT_FILE_NAME} \
    --mace_runtime ${RUNTIME} \
    --input_node ${INPUT_NODES} \
    --output_node ${OUTPUT_NODES} \
    --input_shape ${INPUT_SHAPES} \
    --output_shape ${OUTPUT_SHAPES} || exit 1

fi