Commit 08a362e7 authored by Y yejianwu

merge with origin master

......@@ -8,6 +8,7 @@ package(
cc_library(
name = "generated_models",
srcs = glob(["models/*/*.cc"]),
hdrs = glob(["models/*/*.h"]),
linkstatic = 1,
deps = [
"//mace/core",
......@@ -33,7 +34,6 @@ cc_library(
linkstatic = 1,
)
cc_library(
name = "generated_version",
srcs = ["version/version.cc"],
......
......@@ -10,6 +10,7 @@
#include <map>
#include "mace/public/mace.h"
#include "mace/public/mace_types.h"
namespace mace {
......
(This diff has been collapsed.)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/public/mace_runtime.h"
#include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint,
GPUPriorityHint gpu_priority_hint) {
VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint
<< ", gpu_priority_hint: " << gpu_priority_hint;
OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint);
}
void ConfigOmpThreadsAndAffinity(int omp_num_threads,
CPUPowerOption power_option) {
VLOG(1) << "Config CPU Runtime: omp_num_threads: " << omp_num_threads
<< ", cpu_power_option: " << power_option;
SetOmpThreadsAndAffinity(omp_num_threads, power_option);
}
}  // namespace mace
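A minimal usage sketch of these two hooks (hint values are illustrative; both calls are typically made once, before the first engine is constructed):

#include "mace/public/mace_runtime.h"

void ConfigureRuntime() {
  // GPU path: ask the driver for high performance but low queue priority.
  mace::ConfigOpenCLRuntime(mace::GPUPerfHint::PERF_HIGH,
                            mace::GPUPriorityHint::PRIORITY_LOW);
  // CPU path: 4 OpenMP threads; HIGH_PERFORMANCE is assumed here to
  // prefer the faster cores.
  mace::ConfigOmpThreadsAndAffinity(4, mace::CPUPowerOption::HIGH_PERFORMANCE);
}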
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <memory>
#include <numeric>
#include "mace/public/mace_types.h"
#include "mace/utils/logging.h"
namespace mace {
ConstTensor::ConstTensor(const std::string &name,
const unsigned char *data,
const std::vector<int64_t> &dims,
const DataType data_type,
uint32_t node_id)
: name_(name),
data_(data),
data_size_(std::accumulate(
dims.begin(), dims.end(), 1, std::multiplies<int64_t>())),
dims_(dims.begin(), dims.end()),
data_type_(data_type),
node_id_(node_id) {}
ConstTensor::ConstTensor(const std::string &name,
const unsigned char *data,
const std::vector<int64_t> &dims,
const int data_type,
uint32_t node_id)
: name_(name),
data_(data),
data_size_(std::accumulate(
dims.begin(), dims.end(), 1, std::multiplies<int64_t>())),
dims_(dims.begin(), dims.end()),
data_type_(static_cast<DataType>(data_type)),
node_id_(node_id) {}
const std::string &ConstTensor::name() const { return name_; }
const unsigned char *ConstTensor::data() const { return data_; }
int64_t ConstTensor::data_size() const { return data_size_; }
const std::vector<int64_t> &ConstTensor::dims() const { return dims_; }
DataType ConstTensor::data_type() const { return data_type_; }
uint32_t ConstTensor::node_id() const { return node_id_; }
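// Usage sketch (illustrative, not part of the sources above): ConstTensor
// does not copy the buffer, so the data must outlive the tensor.
//   static const float kWeights[6] = {1, 2, 3, 4, 5, 6};
//   ConstTensor w("conv1_w",
//                 reinterpret_cast<const unsigned char *>(kWeights),
//                 {2, 3}, DT_FLOAT);
//   // w.data_size() == 6 (product of dims), w.data_type() == DT_FLOAT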
Argument::Argument() : has_bits_(0) {}
void Argument::CopyFrom(const Argument &from) {
this->name_ = from.name();
this->f_ = from.f();
this->i_ = from.i();
this->s_ = from.s();
auto floats = from.floats();
this->floats_.resize(floats.size());
std::copy(floats.begin(), floats.end(), this->floats_.begin());
auto ints = from.ints();
this->ints_.resize(ints.size());
std::copy(ints.begin(), ints.end(), this->ints_.begin());
auto strings = from.strings();
this->strings_.resize(strings.size());
std::copy(strings.begin(), strings.end(), this->strings_.begin());
this->has_bits_ = from.has_bits_;
}
const std::string &Argument::name() const { return name_; }
void Argument::set_name(const std::string &value) { name_ = value; }
bool Argument::has_f() const { return (has_bits_ & 0x00000001u) != 0; }
void Argument::set_has_f() { has_bits_ |= 0x00000001u; }
float Argument::f() const { return f_; }
void Argument::set_f(float value) {
set_has_f();
f_ = value;
}
bool Argument::has_i() const { return (has_bits_ & 0x00000002u) != 0; }
void Argument::set_has_i() { has_bits_ |= 0x00000002u; }
int64_t Argument::i() const { return i_; }
void Argument::set_i(int64_t value) {
set_has_i();
i_ = value;
}
bool Argument::has_s() const { return (has_bits_ & 0x00000004u) != 0; }
void Argument::set_has_s() { has_bits_ |= 0x00000004u; }
std::string Argument::s() const { return s_; }
void Argument::set_s(const std::string &value) {
set_has_s();
s_ = value;
}
const std::vector<float> &Argument::floats() const { return floats_; }
void Argument::add_floats(float value) { floats_.push_back(value); }
void Argument::set_floats(const std::vector<float> &value) {
floats_.resize(value.size());
std::copy(value.begin(), value.end(), floats_.begin());
}
const std::vector<int64_t> &Argument::ints() const { return ints_; }
void Argument::add_ints(int64_t value) { ints_.push_back(value); }
void Argument::set_ints(const std::vector<int64_t> &value) {
ints_.resize(value.size());
std::copy(value.begin(), value.end(), ints_.begin());
}
const std::vector<std::string> &Argument::strings() const { return strings_; }
void Argument::add_strings(const ::std::string &value) {
strings_.push_back(value);
}
void Argument::set_strings(const std::vector<std::string> &value) {
strings_.resize(value.size());
std::copy(value.begin(), value.end(), strings_.begin());
}
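// Usage sketch (illustrative): has_bits_ provides proto2-style presence
// tracking for the scalar fields; repeated fields are plain vectors.
//   Argument arg;
//   arg.set_name("strides");
//   arg.set_ints({1, 2, 2, 1});
//   arg.has_i();  // false: the bit is only set by set_i()
//   arg.set_i(1);
//   arg.has_i();  // true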
// Node Input
NodeInput::NodeInput(int node_id, int output_port)
: node_id_(node_id), output_port_(output_port) {}
void NodeInput::CopyFrom(const NodeInput &from) {
node_id_ = from.node_id();
output_port_ = from.output_port();
}
int NodeInput::node_id() const { return node_id_; }
void NodeInput::set_node_id(int node_id) { node_id_ = node_id; }
int NodeInput::output_port() const { return output_port_; }
void NodeInput::set_output_port(int output_port) { output_port_ = output_port; }
// OutputShape
OutputShape::OutputShape() {}
OutputShape::OutputShape(const std::vector<int64_t> &dims)
: dims_(dims.begin(), dims.end()) {}
void OutputShape::CopyFrom(const OutputShape &from) {
auto from_dims = from.dims();
dims_.resize(from_dims.size());
std::copy(from_dims.begin(), from_dims.end(), dims_.begin());
}
const std::vector<int64_t> &OutputShape::dims() const { return dims_; }
// Operator Def
void OperatorDef::CopyFrom(const OperatorDef &from) {
name_ = from.name();
type_ = from.type();
auto from_input = from.input();
input_.resize(from_input.size());
std::copy(from_input.begin(), from_input.end(), input_.begin());
auto from_output = from.output();
output_.resize(from_output.size());
std::copy(from_output.begin(), from_output.end(), output_.begin());
auto from_arg = from.arg();
arg_.resize(from_arg.size());
for (int i = 0; i < from_arg.size(); ++i) {
arg_[i].CopyFrom(from_arg[i]);
}
auto from_output_shape = from.output_shape();
output_shape_.resize(from_output_shape.size());
for (int i = 0; i < from_output_shape.size(); ++i) {
output_shape_[i].CopyFrom(from_output_shape[i]);
}
auto from_data_type = from.output_type();
output_type_.resize(from_data_type.size());
std::copy(from_data_type.begin(), from_data_type.end(), output_type_.begin());
auto mem_ids = from.mem_id();
mem_id_.resize(mem_ids.size());
std::copy(mem_ids.begin(), mem_ids.end(), mem_id_.begin());
// nnlib
node_id_ = from.node_id();
op_id_ = from.op_id();
padding_ = from.padding();
auto from_node_input = from.node_input();
node_input_.resize(from_node_input.size());
for (int i = 0; i < from_node_input.size(); ++i) {
node_input_[i].CopyFrom(from_node_input[i]);
}
auto from_out_max_byte_size = from.out_max_byte_size();
out_max_byte_size_.resize(from_out_max_byte_size.size());
std::copy(from_out_max_byte_size.begin(), from_out_max_byte_size.end(),
out_max_byte_size_.begin());
has_bits_ = from.has_bits_;
}
const std::string &OperatorDef::name() const { return name_; }
void OperatorDef::set_name(const std::string &name_) {
set_has_name();
OperatorDef::name_ = name_;
}
bool OperatorDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
void OperatorDef::set_has_name() { has_bits_ |= 0x00000001u; }
const std::string &OperatorDef::type() const { return type_; }
void OperatorDef::set_type(const std::string &type_) {
set_has_type();
OperatorDef::type_ = type_;
}
bool OperatorDef::has_type() const { return (has_bits_ & 0x00000002u) != 0; }
void OperatorDef::set_has_type() { has_bits_ |= 0x00000002u; }
const std::vector<int> &OperatorDef::mem_id() const { return mem_id_; }
void OperatorDef::set_mem_id(const std::vector<int> &value) {
mem_id_.resize(value.size());
std::copy(value.begin(), value.end(), mem_id_.begin());
}
uint32_t OperatorDef::node_id() const { return node_id_; }
void OperatorDef::set_node_id(uint32_t node_id) { node_id_ = node_id; }
uint32_t OperatorDef::op_id() const { return op_id_; }
uint32_t OperatorDef::padding() const { return padding_; }
void OperatorDef::set_padding(uint32_t padding) { padding_ = padding; }
const std::vector<NodeInput> &OperatorDef::node_input() const {
return node_input_;
}
void OperatorDef::add_node_input(const NodeInput &value) {
node_input_.push_back(value);
}
const std::vector<int> &OperatorDef::out_max_byte_size() const {
return out_max_byte_size_;
}
void OperatorDef::add_out_max_byte_size(int value) {
out_max_byte_size_.push_back(value);
}
const std::vector<std::string> &OperatorDef::input() const { return input_; }
const std::string &OperatorDef::input(int index) const {
MACE_CHECK(0 <= index && index < static_cast<int>(input_.size()));
return input_[index];
}
std::string *OperatorDef::add_input() {
input_.push_back("");
return &input_.back();
}
void OperatorDef::add_input(const ::std::string &value) {
input_.push_back(value);
}
void OperatorDef::add_input(::std::string &&value) { input_.push_back(value); }
void OperatorDef::set_input(const std::vector<std::string> &value) {
input_.resize(value.size());
std::copy(value.begin(), value.end(), input_.begin());
}
const std::vector<std::string> &OperatorDef::output() const { return output_; }
const std::string &OperatorDef::output(int index) const {
MACE_CHECK(0 <= index && index < static_cast<int>(output_.size()));
return output_[index];
}
std::string *OperatorDef::add_output() {
output_.push_back("");
return &output_.back();
}
void OperatorDef::add_output(const ::std::string &value) {
output_.push_back(value);
}
void OperatorDef::add_output(::std::string &&value) {
output_.push_back(value);
}
void OperatorDef::set_output(const std::vector<std::string> &value) {
output_.resize(value.size());
std::copy(value.begin(), value.end(), output_.begin());
}
const std::vector<Argument> &OperatorDef::arg() const { return arg_; }
Argument *OperatorDef::add_arg() {
arg_.emplace_back(Argument());
return &arg_.back();
}
const std::vector<OutputShape> &OperatorDef::output_shape() const {
return output_shape_;
}
void OperatorDef::add_output_shape(const OutputShape &value) {
output_shape_.push_back(value);
}
const std::vector<DataType> &OperatorDef::output_type() const {
return output_type_;
}
void OperatorDef::set_output_type(const std::vector<DataType> &value) {
output_type_.resize(value.size());
std::copy(value.begin(), value.end(), output_type_.begin());
}
// MemoryBlock
MemoryBlock::MemoryBlock(int mem_id, uint32_t x, uint32_t y)
: mem_id_(mem_id), x_(x), y_(y) {}
int MemoryBlock::mem_id() const { return mem_id_; }
uint32_t MemoryBlock::x() const { return x_; }
uint32_t MemoryBlock::y() const { return y_; }
// MemoryArena
const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
return mem_block_;
}
std::vector<MemoryBlock> &MemoryArena::mutable_mem_block() {
return mem_block_;
}
int MemoryArena::mem_block_size() const { return mem_block_.size(); }
// InputInfo
const std::string &InputInfo::name() const { return name_; }
int32_t InputInfo::node_id() const { return node_id_; }
int32_t InputInfo::max_byte_size() const { return max_byte_size_; }
DataType InputInfo::data_type() const { return data_type_; }
const std::vector<int32_t> &InputInfo::dims() const { return dims_; }
// OutputInfo
const std::string &OutputInfo::name() const { return name_; }
int32_t OutputInfo::node_id() const { return node_id_; }
int32_t OutputInfo::max_byte_size() const { return max_byte_size_; }
DataType OutputInfo::data_type() const { return data_type_; }
void OutputInfo::set_data_type(DataType data_type) { data_type_ = data_type; }
const std::vector<int32_t> &OutputInfo::dims() const { return dims_; }
void OutputInfo::set_dims(const std::vector<int32_t> &dims) { dims_ = dims; }
// NetDef
NetDef::NetDef() : has_bits_(0) {}
const std::string &NetDef::name() const { return name_; }
void NetDef::set_name(const std::string &value) {
set_has_name();
name_ = value;
}
bool NetDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
void NetDef::set_has_name() { has_bits_ |= 0x00000001u; }
const std::string &NetDef::version() const { return version_; }
void NetDef::set_version(const std::string &value) {
set_has_version();
version_ = value;
}
bool NetDef::has_version() const { return (has_bits_ & 0x00000002u) != 0; }
void NetDef::set_has_version() { has_bits_ |= 0x00000002u; }
const std::vector<OperatorDef> &NetDef::op() const { return op_; }
OperatorDef *NetDef::add_op() {
op_.emplace_back(OperatorDef());
return &op_.back();
}
std::vector<OperatorDef> &NetDef::mutable_op() { return op_; }
const std::vector<Argument> &NetDef::arg() const { return arg_; }
Argument *NetDef::add_arg() {
arg_.emplace_back(Argument());
return &arg_.back();
}
std::vector<Argument> &NetDef::mutable_arg() { return arg_; }
const std::vector<ConstTensor> &NetDef::tensors() const { return tensors_; }
std::vector<ConstTensor> &NetDef::mutable_tensors() { return tensors_; }
const MemoryArena &NetDef::mem_arena() const { return mem_arena_; }
MemoryArena &NetDef::mutable_mem_arena() {
set_has_mem_arena();
return mem_arena_;
}
bool NetDef::has_mem_arena() const { return (has_bits_ & 0x00000004u) != 0; }
void NetDef::set_has_mem_arena() { has_bits_ |= 0x00000004u; }
const std::vector<InputInfo> &NetDef::input_info() const { return input_info_; }
const std::vector<OutputInfo> &NetDef::output_info() const {
return output_info_;
}
std::vector<OutputInfo> &NetDef::mutable_output_info() { return output_info_; }
int NetDef::op_size() const { return op_.size(); }
const OperatorDef &NetDef::op(const int idx) const {
MACE_CHECK(0 <= idx && idx < op_size());
return op_[idx];
}
}  // namespace mace
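For orientation, the builders above compose like this hypothetical single-op net (all names are illustrative):

mace::NetDef net;
net.set_name("demo_net");
mace::OperatorDef *op = net.add_op();
op->set_name("conv1");
op->set_type("Conv2D");
op->add_input("input:0");
op->add_input("conv1_w:0");
op->add_output("conv1:0");
mace::Argument *strides = op->add_arg();
strides->set_name("strides");
strides->set_ints({1, 1});
// net.op_size() == 1, net.op(0).type() == "Conv2D"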
......@@ -16,6 +16,7 @@
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/public/mace.h"
#include "mace/public/mace_types.h"
namespace mace {
......
......@@ -6,7 +6,7 @@
#ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
#define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
namespace mace {
......
......@@ -14,6 +14,7 @@
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/timer.h"
namespace mace {
......
......@@ -8,7 +8,7 @@
#include <cstdint>
#include <string>
#include "mace/public/mace.h"
#include "mace/public/mace_types.h"
#include "include/half.hpp"
namespace mace {
......
......@@ -23,9 +23,11 @@
#include "gflags/gflags.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead
namespace mace {
namespace MACE_MODEL_TAG {
......@@ -145,19 +147,26 @@ struct mallinfo LogMallinfoChange(struct mallinfo prev) {
return curr;
}
DEFINE_string(input_node, "input_node0,input_node1",
DEFINE_string(input_node,
"input_node0,input_node1",
"input nodes, separated by comma");
DEFINE_string(input_shape, "1,224,224,3:1,1,1,10",
DEFINE_string(input_shape,
"1,224,224,3:1,1,1,10",
"input shapes, separated by colon and comma");
DEFINE_string(output_node, "output_node0,output_node1",
DEFINE_string(output_node,
"output_node0,output_node1",
"output nodes, separated by comma");
DEFINE_string(output_shape, "1,224,224,2:1,1,1,10",
DEFINE_string(output_shape,
"1,224,224,2:1,1,1,10",
"output shapes, separated by colon and comma");
DEFINE_string(input_file, "",
DEFINE_string(input_file,
"",
"input file name | input file prefix for multiple inputs.");
DEFINE_string(output_file, "",
DEFINE_string(output_file,
"",
"output file name | output file prefix for multiple outputs");
DEFINE_string(model_data_file, "",
DEFINE_string(model_data_file,
"",
"model data file name, used when EMBED_MODEL_DATA set to 0");
DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
DEFINE_int32(round, 1, "round");
......@@ -166,115 +175,14 @@ DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(omp_num_threads, 8, "num of openmp threads");
DEFINE_int32(cpu_power_option,
             0,
             "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE");
bool SingleInputAndOutput(const std::vector<int64_t> &input_shape,
const std::vector<int64_t> &output_shape) {
// load model
int64_t t0 = NowMicros();
const unsigned char *model_data =
mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
int64_t t1 = NowMicros();
LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us";
int64_t init_micros = t1 - t0;
DeviceType device_type = ParseDeviceType(FLAGS_device);
LOG(INFO) << "Runing with device type: " << device_type;
// config runtime
if (device_type == DeviceType::OPENCL) {
mace::ConfigOpenCLRuntime(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
} else if (device_type == DeviceType::CPU) {
mace::ConfigOmpThreadsAndAffinity(
FLAGS_omp_num_threads,
static_cast<CPUPowerOption>(FLAGS_cpu_power_option));
}
// Init model
LOG(INFO) << "Run init";
t0 = NowMicros();
mace::MaceEngine engine(&net_def, device_type);
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
t1 = NowMicros();
init_micros += t1 - t0;
LOG(INFO) << "Net init latency: " << t1 - t0 << " us";
LOG(INFO) << "Total init latency: " << init_micros << " us";
// Allocate input and output
int64_t input_size =
std::accumulate(input_shape.begin(), input_shape.end(), 1,
std::multiplies<int64_t>());
int64_t output_size =
std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int64_t>());
std::unique_ptr<float[]> input_data(new float[input_size]);
std::unique_ptr<float[]> output_data(new float[output_size]);
// load input
std::ifstream in_file(FLAGS_input_file + "_" + FormatName(FLAGS_input_node),
std::ios::in | std::ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(input_data.get()),
input_size * sizeof(float));
in_file.close();
} else {
LOG(INFO) << "Open input file failed";
return -1;
}
LOG(INFO) << "Warm up run";
t0 = NowMicros();
engine.Run(input_data.get(), input_shape, output_data.get());
t1 = NowMicros();
LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us";
if (FLAGS_round > 0) {
LOG(INFO) << "Run model";
t0 = NowMicros();
struct mallinfo prev = mallinfo();
for (int i = 0; i < FLAGS_round; ++i) {
engine.Run(input_data.get(), input_shape, output_data.get());
if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
LOG(INFO) << "=== check malloc info change #" << i << " ===";
prev = LogMallinfoChange(prev);
}
}
t1 = NowMicros();
LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us";
}
if (FLAGS_restart_round == 1) {
if (output_data != nullptr) {
std::string output_name =
    FLAGS_output_file + "_" + FormatName(FLAGS_output_node);
std::ofstream out_file(output_name, std::ios::binary);
out_file.write((const char *) (output_data.get()),
output_size * sizeof(float));
out_file.flush();
out_file.close();
LOG(INFO) << "Write output file "
<< output_name
<< " with size " << output_size
<< " done.";
} else {
LOG(INFO) << "Output data is null";
}
}
return true;
}
bool RunModel(const std::vector<std::string> &input_names,
              const std::vector<std::vector<int64_t>> &input_shapes,
              const std::vector<std::string> &output_names,
              const std::vector<std::vector<int64_t>> &output_shapes) {
// load model
int64_t t0 = NowMicros();
const unsigned char *model_data =
......@@ -312,42 +220,42 @@ bool MultipleInputOrOutput(
const size_t input_count = input_names.size();
const size_t output_count = output_names.size();
std::map<std::string, mace::MaceTensor> inputs;
std::map<std::string, mace::MaceTensor> outputs;
for (size_t i = 0; i < input_count; ++i) {
// Allocate input and output
int64_t input_size =
std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
std::multiplies<int64_t>());
auto buffer_in = std::shared_ptr<float>(new float[input_size],
                                        std::default_delete<float[]>());
// load input
std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
std::ios::in | std::ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(buffer_in.get()),
input_size * sizeof(float));
in_file.close();
} else {
LOG(INFO) << "Open input file failed";
return -1;
}
inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in);
}
for (size_t i = 0; i < output_count; ++i) {
int64_t output_size =
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
std::multiplies<int64_t>());
auto buffer_out = std::shared_ptr<float>(new float[output_size],
                                         std::default_delete<float[]>());
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
}
LOG(INFO) << "Warm up run";
t0 = NowMicros();
engine.Run(inputs, &outputs);
t1 = NowMicros();
LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us";
......@@ -356,7 +264,7 @@ bool MultipleInputOrOutput(
t0 = NowMicros();
struct mallinfo prev = mallinfo();
for (int i = 0; i < FLAGS_round; ++i) {
engine.Run(inputs, &outputs);
if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
LOG(INFO) << "=== check malloc info change #" << i << " ===";
prev = LogMallinfoChange(prev);
......@@ -367,20 +275,19 @@ bool MultipleInputOrOutput(
}
for (size_t i = 0; i < output_count; ++i) {
std::string output_name =
    FLAGS_output_file + "_" + FormatName(output_names[i]);
std::ofstream out_file(output_name, std::ios::binary);
int64_t output_size =
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
std::multiplies<int64_t>());
out_file.write(
    reinterpret_cast<char *>(outputs[output_names[i]].data().get()),
    output_size * sizeof(float));
out_file.flush();
out_file.close();
LOG(INFO) << "Write output file "
<< output_name
<< " with size " << output_size
<< " done.";
LOG(INFO) << "Write output file " << output_name << " with size "
<< output_size << " done.";
}
return true;
......@@ -391,7 +298,6 @@ int Main(int argc, char **argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
LOG(INFO) << "mace version: " << MaceVersion();
LOG(INFO) << "mace git version: " << MaceGitVersion();
LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum();
LOG(INFO) << "input node: " << FLAGS_input_node;
LOG(INFO) << "input shape: " << FLAGS_input_shape;
......@@ -431,14 +337,8 @@ int Main(int argc, char **argv) {
#pragma omp parallel for
for (int i = 0; i < FLAGS_restart_round; ++i) {
VLOG(0) << "restart round " << i;
ret =
    RunModel(input_names, input_shape_vec, output_names, output_shape_vec);
}
if (ret) {
return 0;
......
......@@ -95,7 +95,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(buffer->buffer_offset() /
GetEnumTypeSize(buffer->dtype())));
}
if (type == CONV2D_FILTER) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
} else if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if (type == WEIGHT_HEIGHT || type == WEIGHT_WIDTH) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
......
......@@ -2,6 +2,7 @@
__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
__private const int input_offset,
__private const int filter_h,
__private const int filter_w,
__private const int out_channel,
__private const int in_channel,
......@@ -22,16 +23,18 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
}
#endif
const int in_channel_idx = w;
const int hw_size = filter_w * filter_h;
const int out_channel_idx = h / hw_size * 4;
const int hw_idx = h % hw_size;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = input_offset
                   + ((h_idx * filter_w + w_idx) * out_channel
                      + out_channel_idx) * in_channel
                   + in_channel_idx;
DATA_TYPE4 values = 0;
if (out_channel_idx < out_channel) {
const int size = out_channel - out_channel_idx;
if (size < 4) {
......@@ -52,10 +55,11 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
}
int2 coord = (int2)(w, h);
WRITE_IMAGET(output, coord, values);
}
__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
__private const int filter_h,
__private const int filter_w,
__private const int out_channel,
__private const int in_channel,
......@@ -76,18 +80,19 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
}
#endif
const int in_channel_idx = w;
const int hw_size = filter_w * filter_h;
const int out_channel_idx = h / hw_size * 4;
const int hw_idx = h % hw_size;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * out_channel
                    + out_channel_idx) * in_channel
                   + in_channel_idx;
if (out_channel_idx < out_channel) {
int2 coord = (int2)(w, h);
DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
const int size = (out_channel - out_channel_idx);
if (size < 4) {
switch (size) {
......@@ -200,7 +205,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+ channel_idx;
const int size = channels - channel_idx;
DATA_TYPE4 values = 0;
if (size < 4) {
switch(size) {
case 3:
......@@ -214,7 +219,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
values = vload4(0, input + offset);
}
int2 coord = (int2)(w, h);
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
......@@ -246,7 +251,7 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
+ channel_idx;
int2 coord = (int2)(w, h);
DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
const int size = channels - channel_idx;
if (size < 4) {
switch (size) {
......@@ -286,7 +291,7 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
const int size = count - w * 4;
DATA_TYPE4 values = 0;
if (size < 4) {
switch(size) {
case 3:
......@@ -300,7 +305,7 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
values = vload4(0, input + offset);
}
int2 coord = (int2)(w, h);
WRITE_IMAGET(output, coord, values);
}
__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
......@@ -325,7 +330,7 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
const int offset = w * 4;
int2 coord = (int2)(w, h);
DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
const int size = count - offset;
if (size < 4) {
switch (size) {
......
#include <common.h>
__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
......@@ -41,8 +41,6 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const int out_w_blks = get_global_size(1);
#endif
#ifdef BIAS
DATA_TYPE4 out0 =
READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
......@@ -64,21 +62,21 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const int height_idx = mad24((out_hb % out_height), stride, -padding_top);
const int batch_idx = mul24((out_hb / out_height), in_height);
const int filter_hw = mul24(filter_width, filter_height);
DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 weights0, weights1, weights2, weights3;
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_idx = in_ch_blk << 2;
int filter_y_idx = mul24(out_ch_blk, filter_hw);
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
// TODO(heliangliang) optimize out these muls
int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
#pragma unroll
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
int in_width_value;
#define READ_INPUT(i) \
......@@ -96,11 +94,10 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
#undef READ_INPUT
weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 0, filter_y_idx));
weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 1, filter_y_idx));
weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 2, filter_y_idx));
weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 3, filter_y_idx));
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
......@@ -123,9 +120,8 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
out3 = mad(in3.z, weights2, out3);
out3 = mad(in3.w, weights3, out3);
filter_y_idx += 1;
}
}
}
......
#include <common.h>
__kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
......@@ -39,8 +39,6 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int out_w_blks = get_global_size(1);
#endif
#ifdef BIAS
DATA_TYPE4 out0 =
READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
......@@ -65,19 +63,18 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int height_idx = mad24((out_hb % out_height), stride, -padding_top);
const int batch_idx = mul24((out_hb / out_height), in_height);
DATA_TYPE4 in0, in1, in2, in3, in4;
DATA_TYPE4 weights0, weights1, weights2, weights3;
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_idx = in_ch_blk << 2;
int filter_y_idx = mul24(out_ch_blk, 9);
int in_hb_idx = height_idx;
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
int in_hb_value = select(in_hb_idx + batch_idx,
-1,
(in_hb_idx < 0 || in_hb_idx >= in_height));
int in_width_idx = 0;
for (short width_idx = 0; width_idx < 3; ++width_idx) {
int in_width_value;
......@@ -97,11 +94,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#undef READ_INPUT
weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 0, filter_y_idx));
weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 1, filter_y_idx));
weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 2, filter_y_idx));
weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 3, filter_y_idx));
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
......@@ -129,10 +125,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out4 = mad(in4.z, weights2, out4);
out4 = mad(in4.w, weights3, out4);
in_width_idx += dilation_w;
filter_y_idx += 1;
}
in_hb_idx += dilation_h;
}
}
......
......@@ -23,13 +23,13 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
(*image_shape)[1] = shape[0] * shape[1];
}
// [RoundUp<4>(Ic), H * W * (Oc + 3) / 4]
void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUp<index_t>(shape[3], 4);
(*image_shape)[1] = shape[0] * shape[1] * RoundUpDiv4(shape[2]);
}
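// Worked example (illustrative): an HWOI filter of shape [3, 3, 64, 32]
// (Kh = Kw = 3, Oc = 64, Ic = 32) maps to
//   image_shape[0] = RoundUp<4>(32)          = 32
//   image_shape[1] = 3 * 3 * RoundUpDiv4(64) = 9 * 16 = 144
// while the old layout produced [3 * 3 * RoundUp<4>(32), RoundUpDiv4(64)]
// = [288, 16].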
// [H * W * M, (Ic + 3) / 4]
......
......@@ -114,6 +114,7 @@ static void Conv2d(int iters,
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL);
BM_CONV_2D(1, 256, 64, 64, 3, 3, 1, 1, VALID, 256);
BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, 1, VALID, 1024);
......@@ -135,6 +136,8 @@ BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 1024, 16, 16, 15, 1, 1, 1, SAME, 2);
// Dilation
BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32);
BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
......
......@@ -7,11 +7,11 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android")
cc_library(
name = "public",
hdrs = [
"mace.h",
"mace_runtime.h",
"mace_types.h",
],
)
......@@ -2,6 +2,9 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
// This file defines core MACE APIs.
// These APIs will be stable and backward compatible.
#ifndef MACE_PUBLIC_MACE_H_
#define MACE_PUBLIC_MACE_H_
......@@ -13,415 +16,60 @@
namespace mace {
const char *MaceVersion();
extern const char *MaceGitVersion();

enum DeviceType { CPU = 0, NEON = 1, OPENCL = 2, HEXAGON = 3 };

enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 };

// MACE input/output tensor
class MaceTensor {
 public:
  // shape - the shape of the tensor, with size n
  // data - the buffer of the tensor, must not be null with size equals
  //        shape[0] * shape[1] * ... * shape[n-1]
  explicit MaceTensor(const std::vector<int64_t> &shape,
                      std::shared_ptr<float> data);
  MaceTensor();
  MaceTensor(const MaceTensor &other);
  MaceTensor(const MaceTensor &&other);
  MaceTensor &operator=(const MaceTensor &other);
  MaceTensor &operator=(const MaceTensor &&other);
  ~MaceTensor();

  const std::vector<int64_t> &shape() const;
  const std::shared_ptr<float> data() const;
  std::shared_ptr<float> data();

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
};

class NetDef;
class RunMetadata;

class MaceEngine {
 public:
  // Single input and output
  explicit MaceEngine(const NetDef *net_def, DeviceType device_type);
  // Multiple input or output
  explicit MaceEngine(const NetDef *net_def,
                      DeviceType device_type,
                      const std::vector<std::string> &input_nodes,
                      const std::vector<std::string> &output_nodes);
  ~MaceEngine();

  MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
                 std::map<std::string, MaceTensor> *outputs);
  MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
                 std::map<std::string, MaceTensor> *outputs,
                 RunMetadata *run_metadata);

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;

  MaceEngine(const MaceEngine &) = delete;
  MaceEngine &operator=(const MaceEngine &) = delete;
};
} // namespace mace
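To make the new contract concrete, a minimal end-to-end sketch of the map-based API (node names, shapes, and the NetDef source are illustrative; error handling elided):

#include <map>
#include <memory>
#include <string>
#include <vector>
#include "mace/public/mace.h"

mace::MaceStatus RunOnce(const mace::NetDef *net_def) {
  mace::MaceEngine engine(net_def, mace::DeviceType::OPENCL,
                          {"input"}, {"output"});
  // Buffers are shared_ptr<float> with array deleters, as in mace_run.cc.
  auto in_buf = std::shared_ptr<float>(new float[1 * 224 * 224 * 3],
                                       std::default_delete<float[]>());
  auto out_buf = std::shared_ptr<float>(new float[1 * 1000],
                                        std::default_delete<float[]>());
  std::map<std::string, mace::MaceTensor> inputs;
  std::map<std::string, mace::MaceTensor> outputs;
  inputs["input"] = mace::MaceTensor({1, 224, 224, 3}, in_buf);
  outputs["output"] = mace::MaceTensor({1, 1000}, out_buf);
  return engine.Run(inputs, &outputs);
}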
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
// This file defines runtime tuning APIs.
// These APIs are not stable.
#ifndef MACE_PUBLIC_MACE_RUNTIME_H_
#define MACE_PUBLIC_MACE_RUNTIME_H_
namespace mace {
enum GPUPerfHint {
PERF_DEFAULT = 0,
PERF_LOW = 1,
PERF_NORMAL = 2,
PERF_HIGH = 3
};
enum GPUPriorityHint {
PRIORITY_DEFAULT = 0,
PRIORITY_LOW = 1,
PRIORITY_NORMAL = 2,
PRIORITY_HIGH = 3
};
enum CPUPowerOption { DEFAULT = 0, HIGH_PERFORMANCE = 1, BATTERY_SAVE = 2 };
void ConfigOpenCLRuntime(GPUPerfHint, GPUPriorityHint);
void ConfigOmpThreadsAndAffinity(int omp_num_threads,
CPUPowerOption power_option);
} // namespace mace
#endif // MACE_PUBLIC_MACE_RUNTIME_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
// This file defines data types used by net creation and benchmark tools.
// These APIs are not stable and should only be used by advanced users.
#ifndef MACE_PUBLIC_MACE_TYPES_H_
#define MACE_PUBLIC_MACE_TYPES_H_
#include <string>
#include <vector>
namespace mace {
// Disable the copy and assignment operator for a class.
#ifndef DISABLE_COPY_AND_ASSIGN
#define DISABLE_COPY_AND_ASSIGN(classname) \
private: \
classname(const classname &) = delete; \
classname &operator=(const classname &) = delete
#endif
enum NetMode { INIT = 0, NORMAL = 1 };
enum DataType {
DT_INVALID = 0,
DT_FLOAT = 1,
DT_DOUBLE = 2,
DT_INT32 = 3,
DT_UINT8 = 4,
DT_INT16 = 5,
DT_INT8 = 6,
DT_STRING = 7,
DT_INT64 = 8,
DT_UINT16 = 9,
DT_BOOL = 10,
DT_HALF = 19,
DT_UINT32 = 22
};
class ConstTensor {
public:
ConstTensor(const std::string &name,
const unsigned char *data,
const std::vector<int64_t> &dims,
const DataType data_type = DT_FLOAT,
uint32_t node_id = 0);
ConstTensor(const std::string &name,
const unsigned char *data,
const std::vector<int64_t> &dims,
const int data_type,
uint32_t node_id = 0);
const std::string &name() const;
const unsigned char *data() const;
int64_t data_size() const;
const std::vector<int64_t> &dims() const;
DataType data_type() const;
uint32_t node_id() const;
private:
const std::string name_;
const unsigned char *data_;
const int64_t data_size_;
const std::vector<int64_t> dims_;
const DataType data_type_;
const uint32_t node_id_;
};
class Argument {
public:
Argument();
void CopyFrom(const Argument &from);
public:
const std::string &name() const;
void set_name(const std::string &value);
bool has_f() const;
float f() const;
void set_f(float value);
bool has_i() const;
int64_t i() const;
void set_i(int64_t value);
bool has_s() const;
std::string s() const;
void set_s(const std::string &value);
const std::vector<float> &floats() const;
void add_floats(float value);
void set_floats(const std::vector<float> &value);
const std::vector<int64_t> &ints() const;
void add_ints(int64_t value);
void set_ints(const std::vector<int64_t> &value);
const std::vector<std::string> &strings() const;
void add_strings(const ::std::string &value);
void set_strings(const std::vector<std::string> &value);
private:
void set_has_f();
void set_has_i();
void set_has_s();
private:
std::string name_;
float f_;
int64_t i_;
std::string s_;
std::vector<float> floats_;
std::vector<int64_t> ints_;
std::vector<std::string> strings_;
uint32_t has_bits_;
};
class NodeInput {
public:
NodeInput() {}
NodeInput(int node_id, int output_port);
void CopyFrom(const NodeInput &from);
public:
int node_id() const;
void set_node_id(int node_id);
int output_port() const;
void set_output_port(int output_port);
private:
int node_id_;
int output_port_;
};
class OutputShape {
public:
OutputShape();
OutputShape(const std::vector<int64_t> &dims); // NOLINT(runtime/explicit)
void CopyFrom(const OutputShape &from);
public:
const std::vector<int64_t> &dims() const;
private:
std::vector<int64_t> dims_;
};
class OperatorDef {
public:
void CopyFrom(const OperatorDef &from);
public:
const std::string &name() const;
void set_name(const std::string &name_);
bool has_name() const;
const std::string &type() const;
void set_type(const std::string &type_);
bool has_type() const;
const std::vector<int> &mem_id() const;
void set_mem_id(const std::vector<int> &value);
uint32_t node_id() const;
void set_node_id(uint32_t node_id);
uint32_t op_id() const;
uint32_t padding() const;
void set_padding(uint32_t padding);
const std::vector<NodeInput> &node_input() const;
void add_node_input(const NodeInput &value);
const std::vector<int> &out_max_byte_size() const;
void add_out_max_byte_size(int value);
const std::vector<std::string> &input() const;
const std::string &input(int index) const;
std::string *add_input();
void add_input(const ::std::string &value);
void add_input(::std::string &&value);
void set_input(const std::vector<std::string> &value);
const std::vector<std::string> &output() const;
const std::string &output(int index) const;
std::string *add_output();
void add_output(const ::std::string &value);
void add_output(::std::string &&value);
void set_output(const std::vector<std::string> &value);
const std::vector<Argument> &arg() const;
Argument *add_arg();
const std::vector<OutputShape> &output_shape() const;
void add_output_shape(const OutputShape &value);
const std::vector<DataType> &output_type() const;
void set_output_type(const std::vector<DataType> &value);
private:
void set_has_name();
void set_has_type();
void set_has_mem_id();
private:
std::string name_;
std::string type_;
std::vector<std::string> input_;
std::vector<std::string> output_;
std::vector<Argument> arg_;
std::vector<OutputShape> output_shape_;
std::vector<DataType> output_type_;
std::vector<int> mem_id_;
// nnlib
uint32_t node_id_;
uint32_t op_id_;
uint32_t padding_;
std::vector<NodeInput> node_input_;
std::vector<int> out_max_byte_size_;
uint32_t has_bits_;
};
class MemoryBlock {
public:
MemoryBlock(int mem_id, uint32_t x, uint32_t y);
public:
int mem_id() const;
uint32_t x() const;
uint32_t y() const;
private:
int mem_id_;
uint32_t x_;
uint32_t y_;
};
class MemoryArena {
public:
const std::vector<MemoryBlock> &mem_block() const;
std::vector<MemoryBlock> &mutable_mem_block();
int mem_block_size() const;
private:
std::vector<MemoryBlock> mem_block_;
};
// for hexagon mace-nnlib
class InputInfo {
public:
const std::string &name() const;
int32_t node_id() const;
int32_t max_byte_size() const;
DataType data_type() const;
const std::vector<int32_t> &dims() const;
private:
std::string name_;
int32_t node_id_;
int32_t max_byte_size_; // only support 32-bit len
DataType data_type_;
std::vector<int32_t> dims_;
};
class OutputInfo {
public:
const std::string &name() const;
int32_t node_id() const;
int32_t max_byte_size() const;
DataType data_type() const;
void set_data_type(DataType data_type);
const std::vector<int32_t> &dims() const;
void set_dims(const std::vector<int32_t> &dims);
private:
std::string name_;
int32_t node_id_;
int32_t max_byte_size_; // only support 32-bit len
DataType data_type_;
std::vector<int32_t> dims_;
};
class NetDef {
public:
NetDef();
int op_size() const;
const OperatorDef &op(const int idx) const;
public:
const std::string &name() const;
bool has_name() const;
void set_name(const std::string &value);
const std::string &version() const;
bool has_version() const;
void set_version(const std::string &value);
const std::vector<OperatorDef> &op() const;
OperatorDef *add_op();
std::vector<OperatorDef> &mutable_op();
const std::vector<Argument> &arg() const;
Argument *add_arg();
std::vector<Argument> &mutable_arg();
const std::vector<ConstTensor> &tensors() const;
std::vector<ConstTensor> &mutable_tensors();
const MemoryArena &mem_arena() const;
bool has_mem_arena() const;
MemoryArena &mutable_mem_arena();
const std::vector<InputInfo> &input_info() const;
const std::vector<OutputInfo> &output_info() const;
std::vector<OutputInfo> &mutable_output_info();
private:
void set_has_name();
void set_has_version();
void set_has_mem_arena();
private:
std::string name_;
std::string version_;
std::vector<OperatorDef> op_;
std::vector<Argument> arg_;
std::vector<ConstTensor> tensors_;
// for mem optimization
MemoryArena mem_arena_;
// for hexagon mace-nnlib
std::vector<InputInfo> input_info_;
std::vector<OutputInfo> output_info_;
uint32_t has_bits_;
};
struct CallStats {
int64_t start_micros;
int64_t end_micros;
};
struct OperatorStats {
std::string operator_name;
std::string type;
CallStats stats;
};
struct RunMetadata {
std::vector<OperatorStats> op_stats;
};
} // namespace mace
#endif // MACE_PUBLIC_MACE_TYPES_H_
......@@ -72,9 +72,9 @@ class Shapes(object):
output_shape = np.zeros_like(input_shape)
output_shape[0] = input_shape[0]
output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
                                  - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
                                  - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
output_shape[3] = filter_shape[2]
return output_shape
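The height/width arithmetic in this hunk — input extent plus total padding, minus the dilated kernel extent, divided by the stride, plus one — can be exercised in isolation. A minimal sketch of the same formula, assuming paddings are already summed over both sides as the converter does:
# illustrative sketch, not part of the commit
import math

def conv_output_dim(in_dim, pad, kernel, dilation, stride, round_func=math.floor):
    # effective kernel extent once dilation is applied
    k_eff = kernel + (kernel - 1) * (dilation - 1)
    return int(round_func((in_dim + pad - k_eff) / float(stride))) + 1

# 224x224 input, 3x3 kernel, total padding 2, stride 2, no dilation:
# floor((224 + 2 - 3) / 2) + 1 == 112
print(conv_output_dim(224, 2, 3, 1, 2))  # -> 112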
......@@ -247,12 +247,9 @@ class CaffeConverter(object):
arg.i = self.dt
return output_name
def add_input_transform(self, names, is_single):
def add_input_transform(self, names):
for name in names:
if is_single:
new_input_name = MACE_INPUT_NODE_NAME + ":0"
else:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'BufferToImage'
......@@ -267,12 +264,9 @@ class CaffeConverter(object):
arg.name = 'T'
arg.i = self.dt
def add_output_transform(self, names, is_single):
def add_output_transform(self, names):
for name in names:
if is_single:
output_name = MACE_OUTPUT_NODE_NAME + ":0"
else:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
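Both transform hunks above drop the single-input special case: every boundary tensor now gets its own BufferToImage/ImageToBuffer endpoint, named by suffixing the tensor name. A small sketch of the naming rule; the MACE_*_NODE_NAME values here are assumed placeholders, not necessarily the converter's actual constants:
# illustrative sketch, not part of the commit
MACE_INPUT_NODE_NAME = "mace_input_node"    # assumed value
MACE_OUTPUT_NODE_NAME = "mace_output_node"  # assumed value

def transformed_names(names, prefix):
    # one transform endpoint per tensor, with the usual ":0" port suffix
    return [prefix + '_' + name + ":0" for name in names]

print(transformed_names(["data", "im_info"], MACE_INPUT_NODE_NAME))
# -> ['mace_input_node_data:0', 'mace_input_node_im_info:0']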
......@@ -333,8 +327,18 @@ class CaffeConverter(object):
return pad, stride, kernel
def convert_conv2d(self, op):
op_def = self.CommonConvert(op, 'Conv2D')
param = op.layer.convolution_param
is_depthwise = False
if param.HasField('group'):
if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
is_depthwise = True
else:
raise Exception("Mace do not support group convolution yet")
if is_depthwise:
op_def = self.CommonConvert(op, 'DepthwiseConv2d')
else:
op_def = self.CommonConvert(op, 'Conv2D')
# Add filter
weight_tensor_name = op.name + '_weight:0'
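The new group handling maps a Caffe group convolution to DepthwiseConv2d only when each group holds exactly one input channel; any other grouping is rejected. A standalone sketch of that decision, assuming the Caffe weight layout (out_channels, in_channels_per_group, kh, kw):
# illustrative sketch, not part of the commit
def classify_conv(group, weight_shape):
    # weight_shape: (out_channels, in_channels_per_group, kh, kw) -- Caffe layout
    if group is None or group == 1:
        return 'Conv2D'
    if group == weight_shape[0] and weight_shape[1] == 1:
        return 'DepthwiseConv2d'
    raise Exception("MACE does not support group convolution yet")

print(classify_conv(32, (32, 1, 3, 3)))     # -> 'DepthwiseConv2d'
print(classify_conv(None, (64, 32, 3, 3)))  # -> 'Conv2D'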
......@@ -342,7 +346,7 @@ class CaffeConverter(object):
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu':
buffer_type = "CONV2D_FILTER"
buffer_type = "DW_CONV2D_FILTER" if is_depthwise else "CONV2D_FILTER"
output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
op_def.input.extend([output_name])
else:
......@@ -373,15 +377,16 @@ class CaffeConverter(object):
self.resolved_ops.add(op.name)
output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]],
weight_data.shape,
paddings, strides, dilations,
math.floor)
weight_data.shape,
paddings, strides, dilations,
math.floor)
op.output_shape_map[op.layer.top[0]] = output_shape
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
op_def.type = "FusedConv2D"
if not is_depthwise:
op_def.type = "FusedConv2D"
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
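The fusion above folds a trailing activation into the convolution, but only a plain convolution is renamed to FusedConv2D; a depthwise op keeps its type while still receiving the activation argument. A hedged sketch of that rule, with a one-entry stand-in for activation_name_map:
# illustrative sketch, not part of the commit
activation_name_map = {'ReLU': 'RELU'}  # assumed one-entry stand-in

def fuse_activation(op_type, act_type, args):
    # only a plain Conv2D is renamed; depthwise keeps its type
    if op_type == 'Conv2D':
        op_type = 'FusedConv2D'
    args['activation'] = activation_name_map[act_type]
    return op_type, args

print(fuse_activation('Conv2D', 'ReLU', {}))
# -> ('FusedConv2D', {'activation': 'RELU'})
print(fuse_activation('DepthwiseConv2d', 'ReLU', {}))
# -> ('DepthwiseConv2d', {'activation': 'RELU'})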
......@@ -412,7 +417,7 @@ class CaffeConverter(object):
width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((output_shape[2] + 1) / 2)
return self.winograd and self.device == 'gpu' and \
filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
dilations[0] == 1 and (dilations[0] == dilations[1]) and\
dilations[0] == 1 and (dilations[0] == dilations[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]) and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
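The Winograd gate is truncated by this hunk, but the visible conjuncts already pin down the shape constraints: GPU device, 3x3 kernel, unit stride and dilation, and 16x each channel count fitting inside an OpenCL image. A sketch of just those visible checks; OPENCL_IMAGE_MAX_SIZE is an assumed value here, and the channel order in filter_shape[2:] is not confirmed by the hunk:
# illustrative sketch, not part of the commit
OPENCL_IMAGE_MAX_SIZE = 16384  # assumed limit; the real value is device-dependent

def check_winograd(winograd, device, filter_shape, dilations, strides):
    # filter_shape: (kh, kw, channels, channels) -- exact in/out order not shown in the hunk
    return (winograd and device == 'gpu' and
            filter_shape[0] == 3 and filter_shape[0] == filter_shape[1] and
            dilations[0] == 1 and dilations[0] == dilations[1] and
            strides[0] == 1 and strides[0] == strides[1] and
            16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE and
            16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE)

print(check_winograd(True, 'gpu', (3, 3, 64, 64), (1, 1), (1, 1)))  # -> True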
......@@ -662,7 +667,7 @@ class CaffeConverter(object):
filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]]
output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
paddings, strides, [1, 1], math.ceil)
paddings, strides, [1, 1], math.ceil)
op.output_shape_map[op.layer.top[0]] = output_shape
op_def.output.extend([op.name + ':0'])
......@@ -764,7 +769,7 @@ class CaffeConverter(object):
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
num_outputs = len(op.layer.top)
if (input_shape[3] % num_outputs) != 0 or \
(self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) :
(self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) :
raise Exception('MACE does not support slice with input shape '
+ str(input_shape) + ' and number of outputs ' + str(num_outputs))
output_shape = Shapes.slice_shape(input_shape, num_outputs)
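The guard above requires the channel dimension to split evenly across the top blobs, with the extra GPU requirement that each slice's channel count be a multiple of 4, presumably because the OpenCL image layout packs four channels per pixel. A minimal NHWC sketch of the same rule:
# illustrative sketch, not part of the commit
def slice_output_shape(input_shape, num_outputs, device='cpu'):
    # NHWC: the split happens along the channel axis
    if input_shape[3] % num_outputs != 0 or \
            (device == 'gpu' and (input_shape[3] // num_outputs) % 4 != 0):
        raise Exception('MACE does not support slice with input shape '
                        + str(input_shape) + ' and number of outputs '
                        + str(num_outputs))
    return input_shape[:3] + [input_shape[3] // num_outputs]

print(slice_output_shape([1, 56, 56, 64], 2, device='gpu'))  # -> [1, 56, 56, 32]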
......@@ -789,7 +794,6 @@ class CaffeConverter(object):
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
output_shape = input_shape
shape_param = np.asarray(op.layer.reshape_param.shape.dim)[[0, 3, 2, 1]]
print shape_param
for i in range(len(shape_param)):
if shape_param[i] != 0:
output_shape[i] = shape_param[i]
......@@ -844,29 +848,20 @@ class CaffeConverter(object):
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def replace_in_out_name(self, input_names, output_names, is_single):
def replace_in_out_name(self, input_names, output_names):
in_names = set([input_name + ":0" for input_name in input_names])
out_names = set([output_name + ":0" for output_name in output_names])
if is_single:
for op in self.net_def.op:
for i in range(len(op.input)):
if op.input[i] in in_names:
op.input[i] = MACE_INPUT_NODE_NAME + ':0'
for i in range(len(op.output)):
if op.output[i] in out_names:
op.output[i] = MACE_OUTPUT_NODE_NAME + ':0'
else:
for op in self.net_def.op:
for i in range(len(op.input)):
if op.input[i] in in_names:
op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i]
if op.input[i] in out_names:
op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i]
for i in range(len(op.output)):
if op.output[i] in in_names:
op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i]
if op.output[i] in out_names:
op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i]
for op in self.net_def.op:
for i in range(len(op.input)):
if op.input[i] in in_names:
op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i]
if op.input[i] in out_names:
op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i]
for i in range(len(op.output)):
if op.output[i] in in_names:
op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i]
if op.output[i] in out_names:
op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i]
def add_input_op_shape(self, input_nodes, input_shapes):
assert len(input_nodes) == len(input_shapes)
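With the single-node branch removed, replace_in_out_name rewrites every edge touching a boundary tensor by uniformly prefixing it. The same rewrite over a toy op list, with dicts standing in for the protobuf ops and the MACE_*_NODE_NAME values as assumed placeholders:
# illustrative sketch, not part of the commit
MACE_INPUT_NODE_NAME = "mace_input_node"    # assumed value
MACE_OUTPUT_NODE_NAME = "mace_output_node"  # assumed value

def rename_boundaries(ops, input_names, output_names):
    in_names = set(n + ":0" for n in input_names)
    out_names = set(n + ":0" for n in output_names)
    def rewrite(t):
        if t in in_names:
            return MACE_INPUT_NODE_NAME + '_' + t
        if t in out_names:
            return MACE_OUTPUT_NODE_NAME + '_' + t
        return t
    for op in ops:
        op['input'] = [rewrite(t) for t in op['input']]
        op['output'] = [rewrite(t) for t in op['output']]

ops = [{'input': ['data:0'], 'output': ['prob:0']}]
rename_boundaries(ops, ['data'], ['prob'])
print(ops)
# -> [{'input': ['mace_input_node_data:0'], 'output': ['mace_output_node_prob:0']}]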
......@@ -878,9 +873,8 @@ class CaffeConverter(object):
input_op.output_shape_map[input_op.name] = input_shapes[i]
def convert(self, input_nodes, input_shapes, output_nodes):
is_single = len(input_nodes) == 1 and len(output_nodes) == 1
if self.device == 'gpu':
self.add_input_transform(input_nodes, is_single)
self.add_input_transform(input_nodes)
assert self.ops[0].type == 'Input'
self.add_input_op_shape(input_nodes, input_shapes)
......@@ -925,10 +919,10 @@ class CaffeConverter(object):
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
if self.device == 'gpu':
self.add_output_transform(output_nodes, is_single)
self.add_output_transform(output_nodes)
if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes, is_single)
self.replace_in_out_name(input_nodes, output_nodes)
for op in self.ops:
if op.name not in self.resolved_ops:
......@@ -967,3 +961,4 @@ def convert_to_mace_pb(model_file, weight_file, input_node_str, input_shape_str,
print "Memory optimization done."
return net_def
//
// Copyright (c) 2017 XiaoMi All rights reserved.
// Generated by the mace converter. DO NOT EDIT!
//
// Generated by the mace converter. DO NOT EDIT!
#ifndef MACE_CODEGEN_MODELS_{{tag|upper}}_{{tag|upper}}_H_
#define MACE_CODEGEN_MODELS_{{tag|upper}}_{{tag|upper}}_H_
#include <string>
......@@ -10,13 +13,16 @@
namespace mace {
namespace {{tag}} {
extern const unsigned char *LoadModelData(const char *model_data_file);
const unsigned char *LoadModelData(const char *model_data_file);
extern void UnloadModelData(const unsigned char *model_data);
void UnloadModelData(const unsigned char *model_data);
extern NetDef CreateNet(const unsigned char *model_data);
NetDef CreateNet(const unsigned char *model_data);
extern const std::string ModelChecksum();
const std::string ModelChecksum();
} // namespace {{ tag }}
} // namespace mace
#endif // MACE_CODEGEN_MODELS_{{tag|upper}}_{{tag|upper}}_H_
......@@ -118,12 +118,9 @@ class TFConverter(object):
arg.i = self.dt
return output_name
def add_input_transform(self, names, is_single):
def add_input_transform(self, names):
for name in names:
if is_single:
new_input_name = MACE_INPUT_NODE_NAME + ":0"
else:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'BufferToImage'
......@@ -138,12 +135,9 @@ class TFConverter(object):
arg.name = 'T'
arg.i = self.dt
def add_output_transform(self, names, is_single):
def add_output_transform(self, names):
for name in names:
if is_single:
output_name = MACE_OUTPUT_NODE_NAME + ":0"
else:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
......@@ -362,7 +356,8 @@ class TFConverter(object):
if len(self.tf_graph.get(final_op.name, [])) == 1 \
and self.tf_graph[final_op.name][0].type in activation_name_map:
activation_op = self.tf_graph[final_op.name][0]
op_def.type = "FusedConv2D"
if op_def.type == "Conv2D":
op_def.type = "FusedConv2D"
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
......@@ -805,26 +800,18 @@ class TFConverter(object):
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def replace_in_out_name(self, input_names, output_names, is_single):
def replace_in_out_name(self, input_names, output_names):
in_names = set([input_name + ":0" for input_name in input_names])
out_names = set([output_name + ":0" for output_name in output_names])
if is_single:
for op in self.net_def.op:
if len(op.input) > 0 and op.input[0] in in_names:
op.input[0] = MACE_INPUT_NODE_NAME + ':0'
if len(op.output) > 0 and op.output[0] in out_names:
op.output[0] = MACE_OUTPUT_NODE_NAME + ':0'
else:
for op in self.net_def.op:
if len(op.input) > 0 and op.input[0] in in_names:
op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0]
if len(op.output) > 0 and op.output[0] in out_names:
op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0]
for op in self.net_def.op:
if len(op.input) > 0 and op.input[0] in in_names:
op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0]
if len(op.output) > 0 and op.output[0] in out_names:
op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0]
def convert(self, input_nodes, output_nodes):
is_single = len(input_nodes) == 1 and len(output_nodes) == 1
if self.device == 'gpu':
self.add_input_transform(input_nodes, is_single)
self.add_input_transform(input_nodes)
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
......@@ -892,10 +879,10 @@ class TFConverter(object):
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
if self.device == 'gpu':
self.add_output_transform(output_nodes, is_single)
self.add_output_transform(output_nodes)
if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes, is_single)
self.replace_in_out_name(input_nodes, output_nodes)
for key in self.resolved_ops:
if self.resolved_ops[key] != 1:
......
......@@ -25,6 +25,6 @@ cat <<EOF > ${OUTPUT_FILENAME}
// This is a generated file, DO NOT EDIT
namespace mace {
const char *MaceGitVersion() { return "${GIT_VERSION}"; }
const char *MaceVersion() { return "${GIT_VERSION}"; }
} // namespace mace
EOF
......@@ -12,6 +12,7 @@
#include <utility>
#include "mace/public/mace.h"
#include "mace/public/mace_types.h"
#include "mace/utils/env_time.h"
#include "mace/utils/string_util.h"
......
......@@ -76,26 +76,28 @@ def generate_random_input(target_soc, model_output_dir,
target_soc, model_output_dir, int(generate_data_or_not))
run_command(command)
input_name_list = []
input_file_list = []
if isinstance(input_names, list):
input_name_list.extend(input_names)
else:
input_name_list.append(input_names)
if isinstance(input_files, list):
input_file_list.extend(input_files)
else:
input_file_list.append(input_files)
assert len(input_file_list) == len(input_name_list)
for i in range(len(input_file_list)):
if input_file_list[i] is not None:
dst_input_file = model_output_dir + '/' + input_file_name(input_name_list[i])
if input_file_list[i].startswith("http://") or \
input_file_list[i].startswith("https://"):
urllib.urlretrieve(input_file_list[i], dst_input_file)
else:
print 'Copy input data:', dst_input_file
shutil.copy(input_file_list[i], dst_input_file)
if len(input_file_list) != 0:
input_name_list = []
if isinstance(input_names, list):
input_name_list.extend(input_names)
else:
input_name_list.append(input_names)
if len(input_file_list) != len(input_name_list):
raise Exception('If input_files is set, the number of input files must match the number of input names.')
for i in range(len(input_file_list)):
if input_file_list[i] is not None:
dst_input_file = model_output_dir + '/' + input_file_name(input_name_list[i])
if input_file_list[i].startswith("http://") or \
input_file_list[i].startswith("https://"):
urllib.urlretrieve(input_file_list[i], dst_input_file)
else:
print 'Copy input data:', dst_input_file
shutil.copy(input_file_list[i], dst_input_file)
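The rewritten staging logic only enforces the name/file match when files were actually supplied, and raises instead of asserting. A condensed sketch of the same download-or-copy dispatch; input_file_name() is replaced here by a hypothetical naming scheme:
# illustrative sketch, not part of the commit
import shutil
import urllib  # Python 2; use urllib.request.urlretrieve on Python 3

def stage_input_files(input_files, input_names, model_output_dir):
    if not input_files:
        return
    if len(input_files) != len(input_names):
        raise Exception('If input_files is set, it must match input_names one to one.')
    for name, src in zip(input_names, input_files):
        if src is None:
            continue
        dst = model_output_dir + '/model_input_' + name  # hypothetical naming scheme
        if src.startswith(('http://', 'https://')):
            urllib.urlretrieve(src, dst)
        else:
            shutil.copy(src, dst)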
def generate_model_code():
command = "bash tools/generate_model_code.sh"
......