diff --git a/mace/codegen/BUILD b/mace/codegen/BUILD
index 811b420ed27351422bae9f0b35981846dbdc5da3..b715869984d414234af5961a21cf4867bdb2601d 100644
--- a/mace/codegen/BUILD
+++ b/mace/codegen/BUILD
@@ -8,6 +8,7 @@ package(
 cc_library(
     name = "generated_models",
     srcs = glob(["models/*/*.cc"]),
+    hdrs = glob(["models/*/*.h"]),
     linkstatic = 1,
     deps = [
         "//mace/core",
@@ -33,7 +34,6 @@ cc_library(
     linkstatic = 1,
 )
 
-
 cc_library(
     name = "generated_version",
     srcs = ["version/version.cc"],
diff --git a/mace/core/arg_helper.h b/mace/core/arg_helper.h
index ab8e14b99bf9baa078dd37129a3a089d4fb6096f..296f66e50efdf38c2107f7b55b082fc400c7ee1a 100644
--- a/mace/core/arg_helper.h
+++ b/mace/core/arg_helper.h
@@ -10,6 +10,7 @@
 #include <map>
 
 #include "mace/public/mace.h"
+#include "mace/public/mace_types.h"
 
 namespace mace {
 
diff --git a/mace/core/mace.cc b/mace/core/mace.cc
index 3e64e2b1a60c7c423d8e063f8f70caeda2232b77..49066049676935e289053aa2f05d03e5dcc14e84 100644
--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -2,410 +2,89 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include "mace/public/mace.h"
+#include <memory>
+
 #include "mace/core/net.h"
 #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/core/runtime/cpu/cpu_runtime.h"
 #include "mace/core/types.h"
+#include "mace/public/mace.h"
 
 namespace mace {
 
-ConstTensor::ConstTensor(const std::string &name,
-                         const unsigned char *data,
-                         const std::vector<int64_t> &dims,
-                         const DataType data_type,
-                         uint32_t node_id)
-    : name_(name),
-      data_(data),
-      data_size_(std::accumulate(
-          dims.begin(), dims.end(), 1, std::multiplies<int64_t>())),
-      dims_(dims.begin(), dims.end()),
-      data_type_(data_type),
-      node_id_(node_id) {}
-
-ConstTensor::ConstTensor(const std::string &name,
-                         const unsigned char *data,
-                         const std::vector<int64_t> &dims,
-                         const int data_type,
-                         uint32_t node_id)
-    : name_(name),
-      data_(data),
-      data_size_(std::accumulate(
-          dims.begin(), dims.end(), 1, std::multiplies<int64_t>())),
-      dims_(dims.begin(), dims.end()),
-      data_type_(static_cast<DataType>(data_type)),
-      node_id_(node_id) {}
+// Private state behind MaceTensor: a shape plus a shared float buffer handle.
+struct MaceTensor::Impl {
+  std::vector<int64_t> shape;   // dimensions as supplied by the caller
+  std::shared_ptr<float> data;  // shared handle to the element buffer
+};
 
-const std::string &ConstTensor::name() const { return name_; }
-const unsigned char *ConstTensor::data() const { return data_; }
-int64_t ConstTensor::data_size() const { return data_size_; }
-const std::vector<int64_t> &ConstTensor::dims() const { return dims_; }
-DataType ConstTensor::data_type() const { return data_type_; }
-uint32_t ConstTensor::node_id() const { return node_id_; }
-
-Argument::Argument() : has_bits_(0) {}
-
-void Argument::CopyFrom(const Argument &from) {
-  this->name_ = from.name();
-  this->f_ = from.f();
-  this->i_ = from.i();
-  this->s_ = from.s();
-  auto floats = from.floats();
-  this->floats_.resize(floats.size());
-  std::copy(floats.begin(), floats.end(), this->floats_.begin());
-  auto ints = from.ints();
-  this->ints_.resize(ints.size());
-  std::copy(ints.begin(), ints.end(), this->ints_.begin());
-  auto strings = from.floats();
-  this->strings_.resize(strings.size());
-  std::copy(floats.begin(), floats.end(), this->floats_.begin());
-
-  this->has_bits_ = from.has_bits_;
-}
-const std::string &Argument::name() const { return name_; }
-void Argument::set_name(const std::string &value) { name_ = value; }
-bool Argument::has_f() const { return (has_bits_ & 0x00000001u) != 0; }
-void Argument::set_has_f() { has_bits_ |= 0x00000001u; }
-float Argument::f() const { return f_; }
-void Argument::set_f(float value) {
-  set_has_f();
-  f_ = value;
-}
-bool Argument::has_i() const { return (has_bits_ & 0x00000002u) != 0; }
-void Argument::set_has_i() { has_bits_ |= 0x00000002u; }
-int64_t Argument::i() const { return i_; }
-void Argument::set_i(int64_t value) {
-  set_has_i();
-  i_ = value;
-}
-bool Argument::has_s() const { return (has_bits_ & 0x00000004u) != 0; }
-void Argument::set_has_s() { has_bits_ |= 0x00000004u; }
-std::string Argument::s() const { return s_; }
-void Argument::set_s(const std::string &value) {
-  set_has_s();
-  s_ = value;
-}
-const std::vector<float> &Argument::floats() const { return floats_; }
-void Argument::add_floats(float value) { floats_.push_back(value); }
-void Argument::set_floats(const std::vector<float> &value) {
-  floats_.resize(value.size());
-  std::copy(value.begin(), value.end(), floats_.begin());
-}
-const std::vector<int64_t> &Argument::ints() const { return ints_; }
-void Argument::add_ints(int64_t value) { ints_.push_back(value); }
-void Argument::set_ints(const std::vector<int64_t> &value) {
-  ints_.resize(value.size());
-  std::copy(value.begin(), value.end(), ints_.begin());
-}
-const std::vector<std::string> &Argument::strings() const { return strings_; }
-void Argument::add_strings(const ::std::string &value) {
-  strings_.push_back(value);
-}
-void Argument::set_strings(const std::vector<std::string> &value) {
-  strings_.resize(value.size());
-  std::copy(value.begin(), value.end(), strings_.begin());
+MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
+                       std::shared_ptr<float> data) {
+  MACE_CHECK_NOTNULL(data.get());  // checks the buffer pointer is non-null
+  impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
+  impl_->shape = shape;
+  impl_->data = data;  // copies the handle only; the buffer stays shared
 }
 
-// Node Input
-NodeInput::NodeInput(int node_id, int output_port)
-    : node_id_(node_id), output_port_(output_port) {}
-void NodeInput::CopyFrom(const NodeInput &from) {
-  node_id_ = from.node_id();
-  output_port_ = from.output_port();
+MaceTensor::MaceTensor() {  // empty shape, null data handle
+  impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
 }
-int NodeInput::node_id() const { return node_id_; }
-void NodeInput::set_node_id(int node_id) { node_id_ = node_id; }
-int NodeInput::output_port() const { return output_port_; }
-void NodeInput::set_output_port(int output_port) { output_port_ = output_port; }
 
-// OutputShape
-OutputShape::OutputShape() {}
-OutputShape::OutputShape(const std::vector<int64_t> &dims)
-    : dims_(dims.begin(), dims.end()) {}
-void OutputShape::CopyFrom(const OutputShape &from) {
-  auto from_dims = from.dims();
-  dims_.resize(from_dims.size());
-  std::copy(from_dims.begin(), from_dims.end(), dims_.begin());
+MaceTensor::MaceTensor(const MaceTensor &other) {  // shallow copy
+  impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
+  impl_->shape = other.shape();
+  impl_->data = other.data();  // both tensors now share one buffer
 }
-const std::vector<int64_t> &OutputShape::dims() const { return dims_; }
-
-// Operator Def
-void OperatorDef::CopyFrom(const OperatorDef &from) {
-  name_ = from.name();
-  type_ = from.type();
 
-  auto from_input = from.input();
-  input_.resize(from_input.size());
-  std::copy(from_input.begin(), from_input.end(), input_.begin());
-  auto from_output = from.output();
-  output_.resize(from_output.size());
-  std::copy(from_output.begin(), from_output.end(), output_.begin());
-  auto from_arg = from.arg();
-  arg_.resize(from_arg.size());
-  for (int i = 0; i < from_arg.size(); ++i) {
-    arg_[i].CopyFrom(from_arg[i]);
-  }
-  auto from_output_shape = from.output_shape();
-  output_shape_.resize(from_output_shape.size());
-  for (int i = 0; i < from_output_shape.size(); ++i) {
-    output_shape_[i].CopyFrom(from_output_shape[i]);
-  }
-  auto from_data_type = from.output_type();
-  output_type_.resize(from_data_type.size());
-  std::copy(from_data_type.begin(), from_data_type.end(), output_type_.begin());
-
-  auto mem_ids = from.mem_id();
-  mem_id_.resize(mem_ids.size());
-  std::copy(mem_ids.begin(), mem_ids.end(), mem_id_.begin());
-
-  // nnlib
-  node_id_ = from.node_id();
-  op_id_ = from.op_id();
-  padding_ = from.padding();
-  auto from_node_input = from.node_input();
-  node_input_.resize(from_node_input.size());
-  for (int i = 0; i < from_node_input.size(); ++i) {
-    node_input_[i].CopyFrom(from_node_input[i]);
-  }
-  auto from_out_max_byte_size = from.out_max_byte_size();
-  out_max_byte_size_.resize(from_out_max_byte_size.size());
-  std::copy(from_out_max_byte_size.begin(), from_out_max_byte_size.end(),
-            out_max_byte_size_.begin());
-
-  has_bits_ = from.has_bits_;
+MaceTensor::MaceTensor(const MaceTensor &&other) {  // const&&: copies
+  impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
+  impl_->shape = other.shape();  // `other` is const, so nothing can be moved
+  impl_->data = other.data();    // shares the buffer, same as the copy ctor
 }
 
-const std::string &OperatorDef::name() const { return name_; }
-void OperatorDef::set_name(const std::string &name_) {
-  set_has_name();
-  OperatorDef::name_ = name_;
-}
-bool OperatorDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
-void OperatorDef::set_has_name() { has_bits_ |= 0x00000001u; }
-const std::string &OperatorDef::type() const { return type_; }
-void OperatorDef::set_type(const std::string &type_) {
-  set_has_type();
-  OperatorDef::type_ = type_;
-}
-bool OperatorDef::has_type() const { return (has_bits_ & 0x00000002u) != 0; }
-void OperatorDef::set_has_type() { has_bits_ |= 0x00000002u; }
-const std::vector<int> &OperatorDef::mem_id() const { return mem_id_; }
-void OperatorDef::set_mem_id(const std::vector<int> &value) {
-  mem_id_.resize(value.size());
-  std::copy(value.begin(), value.end(), mem_id_.begin());
-}
-uint32_t OperatorDef::node_id() const { return node_id_; }
-void OperatorDef::set_node_id(uint32_t node_id) { node_id_ = node_id; }
-uint32_t OperatorDef::op_id() const { return op_id_; }
-uint32_t OperatorDef::padding() const { return padding_; }
-void OperatorDef::set_padding(uint32_t padding) { padding_ = padding; }
-const std::vector<NodeInput> &OperatorDef::node_input() const {
-  return node_input_;
-}
-void OperatorDef::add_node_input(const NodeInput &value) {
-  node_input_.push_back(value);
-}
-const std::vector<int> &OperatorDef::out_max_byte_size() const {
-  return out_max_byte_size_;
-}
-void OperatorDef::add_out_max_byte_size(int value) {
-  out_max_byte_size_.push_back(value);
-}
-const std::vector<std::string> &OperatorDef::input() const { return input_; }
-const std::string &OperatorDef::input(int index) const {
-  MACE_CHECK(0 <= index && index <= input_.size());
-  return input_[index];
-}
-std::string *OperatorDef::add_input() {
-  input_.push_back("");
-  return &input_.back();
+MaceTensor &MaceTensor::operator=(const MaceTensor &other) {
+  impl_->shape = other.shape();  // the existing Impl is reused
+  impl_->data = other.data();    // shares the buffer; no element copy
+  return *this;
 }
-void OperatorDef::add_input(const ::std::string &value) {
-  input_.push_back(value);
-}
-void OperatorDef::add_input(::std::string &&value) { input_.push_back(value); }
-void OperatorDef::set_input(const std::vector<std::string> &value) {
-  input_.resize(value.size());
-  std::copy(value.begin(), value.end(), input_.begin());
-}
-const std::vector<std::string> &OperatorDef::output() const { return output_; }
-const std::string &OperatorDef::output(int index) const {
-  MACE_CHECK(0 <= index && index <= output_.size());
-  return output_[index];
-}
-std::string *OperatorDef::add_output() {
-  output_.push_back("");
-  return &output_.back();
-}
-void OperatorDef::add_output(const ::std::string &value) {
-  output_.push_back(value);
-}
-void OperatorDef::add_output(::std::string &&value) {
-  output_.push_back(value);
-}
-void OperatorDef::set_output(const std::vector<std::string> &value) {
-  output_.resize(value.size());
-  std::copy(value.begin(), value.end(), output_.begin());
-}
-const std::vector<Argument> &OperatorDef::arg() const { return arg_; }
-Argument *OperatorDef::add_arg() {
-  arg_.emplace_back(Argument());
-  return &arg_.back();
-}
-const std::vector<OutputShape> &OperatorDef::output_shape() const {
-  return output_shape_;
-}
-void OperatorDef::add_output_shape(const OutputShape &value) {
-  output_shape_.push_back(value);
-}
-const std::vector<DataType> &OperatorDef::output_type() const {
-  return output_type_;
-}
-void OperatorDef::set_output_type(const std::vector<DataType> &value) {
-  output_type_.resize(value.size());
-  std::copy(value.begin(), value.end(), output_type_.begin());
-}
-
-// MemoryBlock
-MemoryBlock::MemoryBlock(int mem_id, uint32_t x, uint32_t y)
-    : mem_id_(mem_id), x_(x), y_(y) {}
-
-int MemoryBlock::mem_id() const { return mem_id_; }
-uint32_t MemoryBlock::x() const { return x_; }
-uint32_t MemoryBlock::y() const { return y_; }
 
-// MemoryArena
-const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
-  return mem_block_;
+MaceTensor &MaceTensor::operator=(const MaceTensor &&other) {
+  impl_->shape = other.shape();  // `other` is const, so this is a copy
+  impl_->data = other.data();    // shares the buffer with `other`
+  return *this;
 }
-std::vector<MemoryBlock> &MemoryArena::mutable_mem_block() {
-  return mem_block_;
-}
-int MemoryArena::mem_block_size() const { return mem_block_.size(); }
-
-// InputInfo
-const std::string &InputInfo::name() const { return name_; }
-int32_t InputInfo::node_id() const { return node_id_; }
-int32_t InputInfo::max_byte_size() const { return max_byte_size_; }
-DataType InputInfo::data_type() const { return data_type_; }
-const std::vector<int32_t> &InputInfo::dims() const { return dims_; }
-
-// OutputInfo
-const std::string &OutputInfo::name() const { return name_; }
-int32_t OutputInfo::node_id() const { return node_id_; }
-int32_t OutputInfo::max_byte_size() const { return max_byte_size_; }
-DataType OutputInfo::data_type() const { return data_type_; }
-void OutputInfo::set_data_type(DataType data_type) { data_type_ = data_type; }
-const std::vector<int32_t> &OutputInfo::dims() const { return dims_; }
-void OutputInfo::set_dims(const std::vector<int32_t> &dims) { dims_ = dims; }
-
-// NetDef
-NetDef::NetDef() : has_bits_(0) {}
-
-const std::string &NetDef::name() const { return name_; }
-void NetDef::set_name(const std::string &value) {
-  set_has_name();
-  name_ = value;
-}
-bool NetDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
-void NetDef::set_has_name() { has_bits_ |= 0x00000001u; }
-const std::string &NetDef::version() const { return version_; }
-void NetDef::set_version(const std::string &value) {
-  set_has_version();
-  version_ = value;
-}
-bool NetDef::has_version() const { return (has_bits_ & 0x00000002u) != 0; }
-void NetDef::set_has_version() { has_bits_ |= 0x00000002u; }
-const std::vector<OperatorDef> &NetDef::op() const { return op_; }
-OperatorDef *NetDef::add_op() {
-  op_.emplace_back(OperatorDef());
-  return &op_.back();
-}
-std::vector<OperatorDef> &NetDef::mutable_op() { return op_; }
-const std::vector<Argument> &NetDef::arg() const { return arg_; }
-Argument *NetDef::add_arg() {
-  arg_.emplace_back(Argument());
-  return &arg_.back();
-}
-std::vector<Argument> &NetDef::mutable_arg() { return arg_; }
-const std::vector<ConstTensor> &NetDef::tensors() const { return tensors_; }
-std::vector<ConstTensor> &NetDef::mutable_tensors() { return tensors_; }
-const MemoryArena &NetDef::mem_arena() const { return mem_arena_; }
-MemoryArena &NetDef::mutable_mem_arena() {
-  set_has_mem_arena();
-  return mem_arena_;
-}
-bool NetDef::has_mem_arena() const { return (has_bits_ & 0x00000004u) != 0; }
-void NetDef::set_has_mem_arena() { has_bits_ |= 0x00000004u; }
-const std::vector<InputInfo> &NetDef::input_info() const { return input_info_; }
-const std::vector<OutputInfo> &NetDef::output_info() const {
-  return output_info_;
-}
-std::vector<OutputInfo> &NetDef::mutable_output_info() { return output_info_; }
 
-int NetDef::op_size() const { return op_.size(); }
+MaceTensor::~MaceTensor() = default;
 
-const OperatorDef &NetDef::op(const int idx) const {
-  MACE_CHECK(0 <= idx && idx < op_size());
-  return op_[idx];
-}
+const std::vector<int64_t> &MaceTensor::shape() const { return impl_->shape; }
 
-void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint,
-                         GPUPriorityHint gpu_priority_hint) {
-  VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint
-          << ", gpu_priority_hint: " << gpu_priority_hint;
-  OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint);
-}
+const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; }
 
-void ConfigOmpThreadsAndAffinity(int omp_num_threads,
-                                 CPUPowerOption power_option) {
-  VLOG(1) << "Config CPU Runtime: omp_num_threads: " << omp_num_threads
-          << ", cpu_power_option: " << power_option;
-  SetOmpThreadsAndAffinity(omp_num_threads, power_option);
-}
+std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
 
 // Mace Engine
-MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type)
-    : op_registry_(new OperatorRegistry()),
-      device_type_(device_type),
-      ws_(new Workspace()),
-      net_(nullptr),
-      hexagon_controller_(nullptr) {
-  ws_->CreateTensor("mace_input_node:0", GetDeviceAllocator(device_type_),
-                    DT_FLOAT);
-  ws_->CreateTensor("mace_output_node:0", GetDeviceAllocator(device_type_),
-                    DT_FLOAT);
-  if (device_type == HEXAGON) {
-    hexagon_controller_.reset(new HexagonControlWrapper());
-    MACE_CHECK(hexagon_controller_->Config(), "hexagon config error");
-    MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
-    hexagon_controller_->SetDebugLevel(
-        static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
-    int dsp_mode =
-        ArgumentHelper::GetSingleArgument<NetDef, int>(*net_def, "dsp_mode", 0);
-    hexagon_controller_->SetGraphMode(dsp_mode);
-    MACE_CHECK(hexagon_controller_->SetupGraph(*net_def),
-               "hexagon setup graph error");
-    if (VLOG_IS_ON(2)) {
-      hexagon_controller_->PrintGraph();
-    }
-  } else {
-    ws_->LoadModelTensor(*net_def, device_type);
-
-    // Init model
-    auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type,
-                         NetMode::INIT);
-    if (!net->Run()) {
-      LOG(FATAL) << "Net init run failed";
-    }
-    net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type));
-  }
-}
-
-extern const char *MaceGitVersion();
-
-MaceEngine::MaceEngine(const NetDef *net_def,
+class MaceEngine::Impl {  // state and logic that MaceEngine forwards to
+ public:
+  explicit Impl(const NetDef *net_def,
+                DeviceType device_type,
+                const std::vector<std::string> &input_nodes,
+                const std::vector<std::string> &output_nodes);
+  ~Impl();
+  // Feeds `inputs` to the net, runs it, and fills the buffers in `*outputs`.
+  MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
+                 std::map<std::string, MaceTensor> *outputs,
+                 RunMetadata *run_metadata);
+
+ private:
+  std::shared_ptr<OperatorRegistry> op_registry_;
+  DeviceType device_type_;         // device chosen at construction
+  std::unique_ptr<Workspace> ws_;  // holds the tensors Run() looks up by name
+  std::unique_ptr<NetBase> net_;   // the net executed by Run()
+  std::unique_ptr<HexagonControlWrapper> hexagon_controller_;  // for HEXAGON
+
+  DISABLE_COPY_AND_ASSIGN(Impl);
+};
+
+MaceEngine::Impl::Impl(const NetDef *net_def,
                        DeviceType device_type,
                        const std::vector<std::string> &input_nodes,
                        const std::vector<std::string> &output_nodes)
@@ -414,7 +93,7 @@ MaceEngine::MaceEngine(const NetDef *net_def,
       ws_(new Workspace()),
       net_(nullptr),
       hexagon_controller_(nullptr) {
-  LOG(INFO) << "MACE GIT VERSION: " << MaceGitVersion();
+  LOG(INFO) << "MACE version: " << MaceVersion();
   for (auto input_name : input_nodes) {
     ws_->CreateTensor(MakeString("mace_input_node_", input_name, ":0"),
                       GetDeviceAllocator(device_type_), DT_FLOAT);
@@ -449,7 +128,8 @@ MaceEngine::MaceEngine(const NetDef *net_def,
     net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type));
   }
 }
-MaceEngine::~MaceEngine() {
+
+MaceEngine::Impl::~Impl() {
   if (device_type_ == HEXAGON) {
     if (VLOG_IS_ON(2)) {
       hexagon_controller_->GetPerfInfo();
@@ -460,81 +140,87 @@ MaceEngine::~MaceEngine() {
   }
 }
 
-bool MaceEngine::Run(const float *input,
-                     const std::vector<index_t> &input_shape,
-                     float *output) {
-  return Run(input, input_shape, output, nullptr);
-}
-
-bool MaceEngine::Run(const float *input,
-                     const std::vector<index_t> &input_shape,
-                     float *output,
-                     RunMetadata *run_metadata) {
-  MACE_CHECK(output != nullptr, "output ptr cannot be NULL");
-  Tensor *input_tensor = ws_->GetTensor("mace_input_node:0");
-  Tensor *output_tensor = ws_->GetTensor("mace_output_node:0");
-  input_tensor->Resize(input_shape);
-  {
-    Tensor::MappingGuard input_guard(input_tensor);
-    float *input_data = input_tensor->mutable_data<float>();
-    memcpy(input_data, input, input_tensor->size() * sizeof(float));
-  }
-  if (device_type_ == HEXAGON) {
-    hexagon_controller_->ExecuteGraph(*input_tensor, output_tensor);
-  } else {
-    if (!net_->Run(run_metadata)) {
-      LOG(FATAL) << "Net run failed";
-    }
-  }
-  // save output
-  if (output_tensor != nullptr) {
-    Tensor::MappingGuard output_guard(output_tensor);
-    auto shape = output_tensor->shape();
-    int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
-                                          std::multiplies<int64_t>());
-    std::memcpy(output, output_tensor->data<float>(),
-                output_size * sizeof(float));
-    return true;
-  } else {
-    return false;
-  }
-}
-
-bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs,
-                     std::map<std::string, float *> &outputs,
-                     RunMetadata *run_metadata) {
-  MACE_CHECK(device_type_ != HEXAGON,
+// Copies each entry of `inputs` into its workspace tensor, runs the net, and
+// copies each requested result into the buffer supplied in `*outputs`.
+// Returns MACE_INVALID_ARGS when the workspace lookup for an input or output
+// yields no tensor or the caller's MaceTensor has a null buffer, and
+// MACE_SUCCESS otherwise. A failed net run is fatal (LOG(FATAL)).
+MaceStatus MaceEngine::Impl::Run(
+    const std::map<std::string, MaceTensor> &inputs,
+    std::map<std::string, MaceTensor> *outputs,
+    RunMetadata *run_metadata) {
+  MACE_CHECK_NOTNULL(outputs);
+  MACE_CHECK(device_type_ != HEXAGON || outputs->size() <= 1,
              "HEXAGON not supports multiple outputs now");
-  for (auto input : inputs) {
+  for (auto &input : inputs) {
     Tensor *input_tensor =
-        ws_->GetTensor(MakeString("mace_input_node_", input.name, ":0"));
-    input_tensor->Resize(input.shape);
+        ws_->GetTensor(MakeString("mace_input_node_", input.first, ":0"));
+    // Fail cleanly on a missing workspace tensor or a null input buffer
+    // instead of dereferencing a null pointer below.
+    if (input_tensor == nullptr || input.second.data() == nullptr) {
+      return MACE_INVALID_ARGS;
+    }
+    input_tensor->Resize(input.second.shape());
     {
       Tensor::MappingGuard input_guard(input_tensor);
       float *input_data = input_tensor->mutable_data<float>();
-      memcpy(input_data, input.data, input_tensor->size() * sizeof(float));
+      memcpy(input_data, input.second.data().get(),
+             input_tensor->size() * sizeof(float));
     }
   }
+  // NOTE(review): the removed single-input Run() sent HEXAGON through
+  // hexagon_controller_->ExecuteGraph(); this path always uses net_ --
+  // confirm net_ is non-null when device_type_ == HEXAGON.
   if (!net_->Run(run_metadata)) {
     LOG(FATAL) << "Net run failed";
   }
-  for (auto output : outputs) {
+  for (auto &output : *outputs) {
     Tensor *output_tensor =
         ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
     // save output
-    if (output_tensor != nullptr && output.second != nullptr) {
+    if (output_tensor != nullptr && output.second.data() != nullptr) {
       Tensor::MappingGuard output_guard(output_tensor);
       auto shape = output_tensor->shape();
-      int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
-                                            std::multiplies<int64_t>());
+      // Seed with an int64_t so the product is not accumulated in `int`.
+      int64_t output_size =
+          std::accumulate(shape.begin(), shape.end(), int64_t{1},
+                          std::multiplies<int64_t>());
       MACE_CHECK(!shape.empty()) << "Output's shape must greater than 0";
-      std::memcpy(output.second, output_tensor->data<float>(),
+      MACE_CHECK(shape == output.second.shape())
+          << "Output shape mismatch: "
+          << MakeString<int64_t>(output.second.shape())
+          << " != " << MakeString<int64_t>(shape);
+      std::memcpy(output.second.data().get(), output_tensor->data<float>(),
                   output_size * sizeof(float));
     } else {
-      return false;
+      return MACE_INVALID_ARGS;
     }
   }
-  return true;
+  return MACE_SUCCESS;
+}
+
+// Builds the engine; all state lives in the heap-allocated Impl.
+MaceEngine::MaceEngine(const NetDef *net_def,
+                       DeviceType device_type,
+                       const std::vector<std::string> &input_nodes,
+                       const std::vector<std::string> &output_nodes) {
+  impl_ = std::unique_ptr<MaceEngine::Impl>(
+      new MaceEngine::Impl(net_def, device_type, input_nodes, output_nodes));
+}
+
+MaceEngine::~MaceEngine() = default;
+
+// Forwards to Impl::Run, passing `run_metadata` through unchanged.
+MaceStatus MaceEngine::Run(const std::map<std::string, MaceTensor> &inputs,
+                           std::map<std::string, MaceTensor> *outputs,
+                           RunMetadata *run_metadata) {
+  return impl_->Run(inputs, outputs, run_metadata);
+}
+
+// Same as the overload above, forwarding nullptr as the RunMetadata.
+MaceStatus MaceEngine::Run(const std::map<std::string, MaceTensor> &inputs,
+                           std::map<std::string, MaceTensor> *outputs) {
+  return impl_->Run(inputs, outputs, nullptr);
 }
 
 }  // namespace mace
diff --git a/mace/core/mace_runtime.cc b/mace/core/mace_runtime.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6d08184d9ce88c9095ef078862e6f8999d99c0c
--- /dev/null
+++ b/mace/core/mace_runtime.cc
@@ -0,0 +1,29 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/public/mace_runtime.h"
+#include "mace/core/runtime/cpu/cpu_runtime.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+
+namespace mace {
+
+// Logs the requested GPU hints at VLOG(1) and forwards them to
+// OpenCLRuntime::Configure().
+void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint,
+                         GPUPriorityHint gpu_priority_hint) {
+  VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint
+          << ", gpu_priority_hint: " << gpu_priority_hint;
+  OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint);
+}
+
+// Logs the requested CPU settings at VLOG(1) and forwards them to
+// SetOmpThreadsAndAffinity().
+void ConfigOmpThreadsAndAffinity(int omp_num_threads,
+                                 CPUPowerOption power_option) {
+  VLOG(1) << "Config CPU Runtime: omp_num_threads: " << omp_num_threads
+          << ", cpu_power_option: " << power_option;
+  SetOmpThreadsAndAffinity(omp_num_threads, power_option);
+}
+
+}  // namespace mace
diff --git a/mace/core/mace_types.cc b/mace/core/mace_types.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04f968e6a8a9066c4d76fcf862eb05b40a7746cf
--- /dev/null
+++ b/mace/core/mace_types.cc
@@ -0,0 +1,355 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <utility>
+
+#include "mace/public/mace_types.h"
+#include "mace/utils/logging.h"
+
+namespace mace {
+
+ConstTensor::ConstTensor(const std::string &name,
+                         const unsigned char *data,
+                         const std::vector<int64_t> &dims,
+                         const DataType data_type,
+                         uint32_t node_id)
+    : name_(name),
+      data_(data),
+      data_size_(std::accumulate(  // int64_t seed: an int seed would truncate
+          dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>())),
+      dims_(dims.begin(), dims.end()),
+      data_type_(data_type),
+      node_id_(node_id) {}
+
+ConstTensor::ConstTensor(const std::string &name,
+                         const unsigned char *data,
+                         const std::vector<int64_t> &dims,
+                         const int data_type,
+                         uint32_t node_id)
+    : name_(name),
+      data_(data),
+      data_size_(std::accumulate(  // int64_t seed: an int seed would truncate
+          dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>())),
+      dims_(dims.begin(), dims.end()),
+      data_type_(static_cast<DataType>(data_type)),
+      node_id_(node_id) {}
+
+const std::string &ConstTensor::name() const { return name_; }
+const unsigned char *ConstTensor::data() const { return data_; }
+int64_t ConstTensor::data_size() const { return data_size_; }
+const std::vector<int64_t> &ConstTensor::dims() const { return dims_; }
+DataType ConstTensor::data_type() const { return data_type_; }
+uint32_t ConstTensor::node_id() const { return node_id_; }
+
+Argument::Argument() : has_bits_(0) {}
+
+void Argument::CopyFrom(const Argument &from) {
+  // Scalar fields.
+  this->name_ = from.name();
+  this->f_ = from.f();
+  this->i_ = from.i();
+  this->s_ = from.s();
+
+  // Repeated fields. Vector copy-assignment is safe even if &from == this.
+  this->floats_ = from.floats();
+  this->ints_ = from.ints();
+  // Fix: this step used to read from.floats() again and copy the floats a
+  // second time, so strings_ never received the source strings.
+  this->strings_ = from.strings();
+
+  // Presence bits for the optional scalar fields.
+  this->has_bits_ = from.has_bits_;
+}
+const std::string &Argument::name() const { return name_; }
+void Argument::set_name(const std::string &value) { name_ = value; }
+bool Argument::has_f() const { return (has_bits_ & 0x00000001u) != 0; }
+void Argument::set_has_f() { has_bits_ |= 0x00000001u; }
+float Argument::f() const { return f_; }
+void Argument::set_f(float value) {
+  set_has_f();
+  f_ = value;
+}
+bool Argument::has_i() const { return (has_bits_ & 0x00000002u) != 0; }
+void Argument::set_has_i() { has_bits_ |= 0x00000002u; }
+int64_t Argument::i() const { return i_; }
+void Argument::set_i(int64_t value) {
+  set_has_i();
+  i_ = value;
+}
+bool Argument::has_s() const { return (has_bits_ & 0x00000004u) != 0; }
+void Argument::set_has_s() { has_bits_ |= 0x00000004u; }
+std::string Argument::s() const { return s_; }
+void Argument::set_s(const std::string &value) {
+  set_has_s();
+  s_ = value;
+}
+const std::vector<float> &Argument::floats() const { return floats_; }
+void Argument::add_floats(float value) { floats_.push_back(value); }
+void Argument::set_floats(const std::vector<float> &value) {
+  // Overwrite the repeated float field with a copy of `value`.
+  floats_ = value;
+}
+const std::vector<int64_t> &Argument::ints() const { return ints_; }
+void Argument::add_ints(int64_t value) { ints_.push_back(value); }
+void Argument::set_ints(const std::vector<int64_t> &value) {
+  // Overwrite the repeated integer field with a copy of `value`.
+  ints_ = value;
+}
+const std::vector<std::string> &Argument::strings() const { return strings_; }
+void Argument::add_strings(const ::std::string &value) {
+  strings_.push_back(value);
+}
+void Argument::set_strings(const std::vector<std::string> &value) {
+  // Overwrite the repeated string field with a copy of `value`.
+  strings_ = value;
+}
+
+// NodeInput
+NodeInput::NodeInput(int node_id, int output_port)
+    : node_id_(node_id), output_port_(output_port) {}
+void NodeInput::CopyFrom(const NodeInput &from) {
+  node_id_ = from.node_id();
+  output_port_ = from.output_port();
+}
+int NodeInput::node_id() const { return node_id_; }
+void NodeInput::set_node_id(int node_id) { node_id_ = node_id; }
+int NodeInput::output_port() const { return output_port_; }
+void NodeInput::set_output_port(int output_port) { output_port_ = output_port; }
+
+// OutputShape
+OutputShape::OutputShape() {}
+OutputShape::OutputShape(const std::vector<int64_t> &dims)
+    : dims_(dims.begin(), dims.end()) {}
+void OutputShape::CopyFrom(const OutputShape &from) {
+  // Take over the source dimensions. Vector copy-assignment resizes dims_
+  // as needed and is safe when &from == this.
+  dims_ = from.dims();
+}
+const std::vector<int64_t> &OutputShape::dims() const { return dims_; }
+
+// OperatorDef
+void OperatorDef::CopyFrom(const OperatorDef &from) {
+  // Overwrites every field of this operator with a copy of `from`.
+  //
+  // Repeated fields are read through const references. The accessors return
+  // `const std::vector<...> &`, so binding them to plain `auto` (as before)
+  // materialised a temporary copy of each vector before copying it again.
+  name_ = from.name();
+  type_ = from.type();
+
+  // Plain value vectors: copy-assignment resizes and copies in one step.
+  input_ = from.input();
+  output_ = from.output();
+  output_type_ = from.output_type();
+  mem_id_ = from.mem_id();
+
+  // Element types with their own CopyFrom() are copied through it.
+  // size_t indices avoid the signed/unsigned comparison against size().
+  const std::vector<Argument> &from_arg = from.arg();
+  arg_.resize(from_arg.size());
+  for (size_t i = 0; i < from_arg.size(); ++i) {
+    arg_[i].CopyFrom(from_arg[i]);
+  }
+
+  const std::vector<OutputShape> &from_output_shape = from.output_shape();
+  output_shape_.resize(from_output_shape.size());
+  for (size_t i = 0; i < from_output_shape.size(); ++i) {
+    output_shape_[i].CopyFrom(from_output_shape[i]);
+  }
+
+  // nnlib
+  node_id_ = from.node_id();
+  op_id_ = from.op_id();
+  padding_ = from.padding();
+
+  const std::vector<NodeInput> &from_node_input = from.node_input();
+  node_input_.resize(from_node_input.size());
+  for (size_t i = 0; i < from_node_input.size(); ++i) {
+    node_input_[i].CopyFrom(from_node_input[i]);
+  }
+  out_max_byte_size_ = from.out_max_byte_size();
+
+  // Presence bits are copied last, alongside the values they describe.
+  has_bits_ = from.has_bits_;
+}
+
+const std::string &OperatorDef::name() const { return name_; }
+void OperatorDef::set_name(const std::string &value) {
+  set_has_name();  // mark the field as present
+  name_ = value;
+}
+bool OperatorDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
+void OperatorDef::set_has_name() { has_bits_ |= 0x00000001u; }
+const std::string &OperatorDef::type() const { return type_; }
+void OperatorDef::set_type(const std::string &value) {
+  set_has_type();  // mark the field as present
+  type_ = value;
+}
+bool OperatorDef::has_type() const { return (has_bits_ & 0x00000002u) != 0; }
+void OperatorDef::set_has_type() { has_bits_ |= 0x00000002u; }
+const std::vector<int> &OperatorDef::mem_id() const { return mem_id_; }
+void OperatorDef::set_mem_id(const std::vector<int> &value) {
+  // Replace the stored memory ids with a copy of `value`.
+  mem_id_ = value;
+}
+uint32_t OperatorDef::node_id() const { return node_id_; }
+void OperatorDef::set_node_id(uint32_t node_id) { node_id_ = node_id; }
+uint32_t OperatorDef::op_id() const { return op_id_; }
+uint32_t OperatorDef::padding() const { return padding_; }
+void OperatorDef::set_padding(uint32_t padding) { padding_ = padding; }
+const std::vector<NodeInput> &OperatorDef::node_input() const {
+  return node_input_;
+}
+void OperatorDef::add_node_input(const NodeInput &value) {
+  node_input_.push_back(value);
+}
+const std::vector<int> &OperatorDef::out_max_byte_size() const {
+  return out_max_byte_size_;
+}
+void OperatorDef::add_out_max_byte_size(int value) {
+  out_max_byte_size_.push_back(value);
+}
+const std::vector<std::string> &OperatorDef::input() const { return input_; }
+const std::string &OperatorDef::input(int index) const {
+  MACE_CHECK(0 <= index && static_cast<size_t>(index) < input_.size());
+  return input_[index];  // in range: index == size() is now rejected
+}
+std::string *OperatorDef::add_input() {
+  input_.push_back("");
+  return &input_.back();
+}
+void OperatorDef::add_input(const ::std::string &value) {
+  input_.push_back(value);
+}
+void OperatorDef::add_input(::std::string &&value) { input_.push_back(std::move(value)); }
+void OperatorDef::set_input(const std::vector<std::string> &value) {
+  // Replace the stored input names with a copy of `value`.
+  input_ = value;
+}
+const std::vector<std::string> &OperatorDef::output() const { return output_; }
+const std::string &OperatorDef::output(int index) const {
+  MACE_CHECK(0 <= index && static_cast<size_t>(index) < output_.size());
+  return output_[index];  // in range: index == size() is now rejected
+}
+std::string *OperatorDef::add_output() {
+  output_.push_back("");
+  return &output_.back();
+}
+void OperatorDef::add_output(const ::std::string &value) {
+  output_.push_back(value);
+}
+void OperatorDef::add_output(::std::string &&value) {
+  output_.push_back(std::move(value));  // `value` is an lvalue here; move it
+}
+void OperatorDef::set_output(const std::vector<std::string> &value) {
+  // Replace the stored output names with a copy of `value`.
+  output_ = value;
+}
+const std::vector<Argument> &OperatorDef::arg() const { return arg_; }
+Argument *OperatorDef::add_arg() {
+  arg_.emplace_back(Argument());
+  return &arg_.back();
+}
+const std::vector<OutputShape> &OperatorDef::output_shape() const {
+  return output_shape_;
+}
+void OperatorDef::add_output_shape(const OutputShape &value) {
+  output_shape_.push_back(value);
+}
+const std::vector<DataType> &OperatorDef::output_type() const {
+  return output_type_;
+}
+void OperatorDef::set_output_type(const std::vector<DataType> &value) {
+  // Replace the stored output data types with a copy of `value`.
+  output_type_ = value;
+}
+
+// MemoryBlock
+MemoryBlock::MemoryBlock(int mem_id, uint32_t x, uint32_t y)
+    : mem_id_(mem_id), x_(x), y_(y) {}
+
+int MemoryBlock::mem_id() const { return mem_id_; }
+uint32_t MemoryBlock::x() const { return x_; }
+uint32_t MemoryBlock::y() const { return y_; }
+
+// MemoryArena
+const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
+  return mem_block_;
+}
+std::vector<MemoryBlock> &MemoryArena::mutable_mem_block() {
+  return mem_block_;
+}
+int MemoryArena::mem_block_size() const { return mem_block_.size(); }
+
+// InputInfo
+const std::string &InputInfo::name() const { return name_; }
+int32_t InputInfo::node_id() const { return node_id_; }
+int32_t InputInfo::max_byte_size() const { return max_byte_size_; }
+DataType InputInfo::data_type() const { return data_type_; }
+const std::vector<int32_t> &InputInfo::dims() const { return dims_; }
+
+// OutputInfo
+const std::string &OutputInfo::name() const { return name_; }
+int32_t OutputInfo::node_id() const { return node_id_; }
+int32_t OutputInfo::max_byte_size() const { return max_byte_size_; }
+DataType OutputInfo::data_type() const { return data_type_; }
+void OutputInfo::set_data_type(DataType data_type) { data_type_ = data_type; }
+const std::vector<int32_t> &OutputInfo::dims() const { return dims_; }
+void OutputInfo::set_dims(const std::vector<int32_t> &dims) { dims_ = dims; }
+
+// NetDef
+NetDef::NetDef() : has_bits_(0) {}
+
+const std::string &NetDef::name() const { return name_; }
+void NetDef::set_name(const std::string &value) {
+  set_has_name();
+  name_ = value;
+}
+bool NetDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
+void NetDef::set_has_name() { has_bits_ |= 0x00000001u; }
+const std::string &NetDef::version() const { return version_; }
+void NetDef::set_version(const std::string &value) {
+  set_has_version();
+  version_ = value;
+}
+bool NetDef::has_version() const { return (has_bits_ & 0x00000002u) != 0; }
+void NetDef::set_has_version() { has_bits_ |= 0x00000002u; }
+const std::vector<OperatorDef> &NetDef::op() const { return op_; }
+OperatorDef *NetDef::add_op() {
+  op_.emplace_back(OperatorDef());
+  return &op_.back();
+}
+std::vector<OperatorDef> &NetDef::mutable_op() { return op_; }
+const std::vector<Argument> &NetDef::arg() const { return arg_; }
+Argument *NetDef::add_arg() {
+  arg_.emplace_back(Argument());
+  return &arg_.back();
+}
+std::vector<Argument> &NetDef::mutable_arg() { return arg_; }
+const std::vector<ConstTensor> &NetDef::tensors() const { return tensors_; }
+std::vector<ConstTensor> &NetDef::mutable_tensors() { return tensors_; }
+const MemoryArena &NetDef::mem_arena() const { return mem_arena_; }
+MemoryArena &NetDef::mutable_mem_arena() {
+  set_has_mem_arena();
+  return mem_arena_;
+}
+bool NetDef::has_mem_arena() const { return (has_bits_ & 0x00000004u) != 0; }
+void NetDef::set_has_mem_arena() { has_bits_ |= 0x00000004u; }
+const std::vector<InputInfo> &NetDef::input_info() const { return input_info_; }
+const std::vector<OutputInfo> &NetDef::output_info() const {
+  return output_info_;
+}
+std::vector<OutputInfo> &NetDef::mutable_output_info() { return output_info_; }
+
+int NetDef::op_size() const { return op_.size(); }
+
+const OperatorDef &NetDef::op(const int idx) const {
+  MACE_CHECK(0 <= idx && idx < op_size());
+  return op_[idx];
+}
+
+};  // namespace mace
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 27afdadd3a3b4c8bfbbe613c36ca558025f7c606..3ca7cd167b1e13d46e6072cea74f19d87df88ced 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -16,6 +16,7 @@
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
 #include "mace/public/mace.h"
+#include "mace/public/mace_types.h"
 
 namespace mace {
 
diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h
index 082673a6a5ea1931a5569b2cd6ecbd4868d97492..dbe19c4d6abe91e896df8de6dab251659b984a06 100644
--- a/mace/core/runtime/cpu/cpu_runtime.h
+++ b/mace/core/runtime/cpu/cpu_runtime.h
@@ -6,7 +6,7 @@
 #ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
 #define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
 
-#include "mace/public/mace.h"
+#include "mace/public/mace_runtime.h"
 
 namespace mace {
 
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 8a3ce06abb63f078efa89ca43b242e46a13e5a3e..414fa7ed91fa205cbbb5b3d3b06d6d7c91d59fcf 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -14,6 +14,7 @@
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_wrapper.h"
+#include "mace/public/mace_runtime.h"
 #include "mace/utils/timer.h"
 
 namespace mace {
diff --git a/mace/core/types.h b/mace/core/types.h
index e7a078f625fbaf869cdfbae50dcaf0be7b3b9054..f038d5be034a3e9140bea2cbcdcde174c9f9918b 100644
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -8,7 +8,7 @@
 #include <cstdint>
 #include <string>
 
-#include "mace/public/mace.h"
+#include "mace/public/mace_types.h"
 #include "include/half.hpp"
 
 namespace mace {
diff --git a/mace/examples/mace_run.cc b/mace/examples/mace_run.cc
index 1bfb79273821884462fc529909521c3e42e590d6..a05782baf44a7caac0c95ec0542083670f7622e8 100644
--- a/mace/examples/mace_run.cc
+++ b/mace/examples/mace_run.cc
@@ -23,9 +23,11 @@
 
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
+#include "mace/public/mace_runtime.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"
 
+// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead
 namespace mace {
 namespace MACE_MODEL_TAG {
 
@@ -145,19 +147,26 @@ struct mallinfo LogMallinfoChange(struct mallinfo prev) {
   return curr;
 }
 
-DEFINE_string(input_node, "input_node0,input_node1",
+DEFINE_string(input_node,
+              "input_node0,input_node1",
               "input nodes, separated by comma");
-DEFINE_string(input_shape, "1,224,224,3:1,1,1,10",
+DEFINE_string(input_shape,
+              "1,224,224,3:1,1,1,10",
               "input shapes, separated by colon and comma");
-DEFINE_string(output_node, "output_node0,output_node1",
+DEFINE_string(output_node,
+              "output_node0,output_node1",
               "output nodes, separated by comma");
-DEFINE_string(output_shape, "1,224,224,2:1,1,1,10",
+DEFINE_string(output_shape,
+              "1,224,224,2:1,1,1,10",
               "output shapes, separated by colon and comma");
-DEFINE_string(input_file, "",
+DEFINE_string(input_file,
+              "",
               "input file name | input file prefix for multiple inputs.");
-DEFINE_string(output_file, "",
+DEFINE_string(output_file,
+              "",
               "output file name | output file prefix for multiple outputs");
-DEFINE_string(model_data_file, "",
+DEFINE_string(model_data_file,
+              "",
               "model data file name, used when EMBED_MODEL_DATA set to 0");
 DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
 DEFINE_int32(round, 1, "round");
@@ -166,115 +175,14 @@ DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
 DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, 8, "num of openmp threads");
-DEFINE_int32(cpu_power_option, 0,
+DEFINE_int32(cpu_power_option,
+             0,
              "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE");
 
-bool SingleInputAndOutput(const std::vector<int64_t> &input_shape,
-                          const std::vector<int64_t> &output_shape) {
-  // load model
-  int64_t t0 = NowMicros();
-  const unsigned char *model_data =
-      mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
-  NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
-  int64_t t1 = NowMicros();
-  LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us";
-  int64_t init_micros = t1 - t0;
-
-  DeviceType device_type = ParseDeviceType(FLAGS_device);
-  LOG(INFO) << "Runing with device type: " << device_type;
-
-  // config runtime
-  if (device_type == DeviceType::OPENCL) {
-    mace::ConfigOpenCLRuntime(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-  } else if (device_type == DeviceType::CPU) {
-    mace::ConfigOmpThreadsAndAffinity(
-        FLAGS_omp_num_threads,
-        static_cast<CPUPowerOption>(FLAGS_cpu_power_option));
-  }
-
-  // Init model
-  LOG(INFO) << "Run init";
-  t0 = NowMicros();
-  mace::MaceEngine engine(&net_def, device_type);
-  if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
-    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
-  }
-  t1 = NowMicros();
-  init_micros += t1 - t0;
-  LOG(INFO) << "Net init latency: " << t1 - t0 << " us";
-  LOG(INFO) << "Total init latency: " << init_micros << " us";
-
-  // Allocate input and output
-  int64_t input_size =
-      std::accumulate(input_shape.begin(), input_shape.end(), 1,
-                      std::multiplies<int64_t>());
-  int64_t output_size =
-      std::accumulate(output_shape.begin(), output_shape.end(), 1,
-                      std::multiplies<int64_t>());
-  std::unique_ptr<float[]> input_data(new float[input_size]);
-  std::unique_ptr<float[]> output_data(new float[output_size]);
-
-  // load input
-  std::ifstream in_file(FLAGS_input_file + "_" + FormatName(FLAGS_input_node),
-                        std::ios::in | std::ios::binary);
-  if (in_file.is_open()) {
-    in_file.read(reinterpret_cast<char *>(input_data.get()),
-                 input_size * sizeof(float));
-    in_file.close();
-  } else {
-    LOG(INFO) << "Open input file failed";
-    return -1;
-  }
-
-  LOG(INFO) << "Warm up run";
-  t0 = NowMicros();
-  engine.Run(input_data.get(), input_shape, output_data.get());
-  t1 = NowMicros();
-  LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us";
-
-  if (FLAGS_round > 0) {
-    LOG(INFO) << "Run model";
-    t0 = NowMicros();
-    struct mallinfo prev = mallinfo();
-    for (int i = 0; i < FLAGS_round; ++i) {
-      engine.Run(input_data.get(), input_shape, output_data.get());
-      if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
-        LOG(INFO) << "=== check malloc info change #" << i << " ===";
-        prev = LogMallinfoChange(prev);
-      }
-    }
-    t1 = NowMicros();
-    LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us";
-  }
-
-  if (FLAGS_restart_round == 1) {
-    if (output_data != nullptr) {
-      std::string
-        output_name = FLAGS_output_file + "_" + FormatName(FLAGS_output_node);
-      std::ofstream out_file(output_name, std::ios::binary);
-      out_file.write((const char *) (output_data.get()),
-                     output_size * sizeof(float));
-      out_file.flush();
-      out_file.close();
-      LOG(INFO) << "Write output file "
-                << output_name
-                << " with size " << output_size
-                << " done.";
-    } else {
-      LOG(INFO) << "Output data is null";
-    }
-  }
-
-  return true;
-}
-
-bool MultipleInputOrOutput(
-    const std::vector<std::string> &input_names,
-    const std::vector<std::vector<int64_t>> &input_shapes,
-    const std::vector<std::string> &output_names,
-    const std::vector<std::vector<int64_t>> &output_shapes) {
+bool RunModel(const std::vector<std::string> &input_names,
+              const std::vector<std::vector<int64_t>> &input_shapes,
+              const std::vector<std::string> &output_names,
+              const std::vector<std::vector<int64_t>> &output_shapes) {
   // load model
   int64_t t0 = NowMicros();
   const unsigned char *model_data =
@@ -312,42 +220,42 @@ bool MultipleInputOrOutput(
 
   const size_t input_count = input_names.size();
   const size_t output_count = output_names.size();
-  std::vector<mace::MaceInputInfo> input_infos(input_count);
-  std::map<std::string, float*> outputs;
-  std::vector<std::unique_ptr<float[]>> input_datas(input_count);
+
+  std::map<std::string, mace::MaceTensor> inputs;
+  std::map<std::string, mace::MaceTensor> outputs;
   for (size_t i = 0; i < input_count; ++i) {
     // Allocate input and output
     int64_t input_size =
         std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
                         std::multiplies<int64_t>());
-    input_datas[i].reset(new float[input_size]);
+    auto buffer_in = std::shared_ptr<float>(new float[input_size],
+                                            std::default_delete<float[]>());
     // load input
     std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
                           std::ios::in | std::ios::binary);
     if (in_file.is_open()) {
-      in_file.read(reinterpret_cast<char *>(input_datas[i].get()),
+      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
                    input_size * sizeof(float));
       in_file.close();
     } else {
       LOG(INFO) << "Open input file failed";
       return -1;
     }
-    input_infos[i].name = input_names[i];
-    input_infos[i].shape = input_shapes[i];
-    input_infos[i].data = input_datas[i].get();
+    inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in);
   }
-  std::vector<std::unique_ptr<float[]>> output_datas(output_count);
+
   for (size_t i = 0; i < output_count; ++i) {
     int64_t output_size =
         std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
                         std::multiplies<int64_t>());
-    output_datas[i].reset(new float[output_size]);
-    outputs[output_names[i]] = output_datas[i].get();
+    auto buffer_out = std::shared_ptr<float>(new float[output_size],
+                                             std::default_delete<float[]>());
+    outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
   }
 
   LOG(INFO) << "Warm up run";
   t0 = NowMicros();
-  engine.Run(input_infos, outputs);
+  engine.Run(inputs, &outputs);
   t1 = NowMicros();
   LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us";
 
@@ -356,7 +264,7 @@ bool MultipleInputOrOutput(
     t0 = NowMicros();
     struct mallinfo prev = mallinfo();
     for (int i = 0; i < FLAGS_round; ++i) {
-      engine.Run(input_infos, outputs);
+      engine.Run(inputs, &outputs);
       if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
         LOG(INFO) << "=== check malloc info change #" << i << " ===";
         prev = LogMallinfoChange(prev);
@@ -367,20 +275,19 @@ bool MultipleInputOrOutput(
   }
 
   for (size_t i = 0; i < output_count; ++i) {
-    std::string output_name = FLAGS_output_file + "_"
-        + FormatName(output_names[i]);
+    std::string output_name =
+        FLAGS_output_file + "_" + FormatName(output_names[i]);
     std::ofstream out_file(output_name, std::ios::binary);
     int64_t output_size =
         std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
                         std::multiplies<int64_t>());
-    out_file.write((const char *) outputs[output_names[i]],
-                   output_size * sizeof(float));
+    out_file.write(
+        reinterpret_cast<char *>(outputs[output_names[i]].data().get()),
+        output_size * sizeof(float));
     out_file.flush();
     out_file.close();
-    LOG(INFO) << "Write output file "
-              << output_name
-              << " with size " << output_size
-              << " done.";
+    LOG(INFO) << "Write output file " << output_name << " with size "
+              << output_size << " done.";
   }
 
   return true;
@@ -391,7 +298,6 @@ int Main(int argc, char **argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
   LOG(INFO) << "mace version: " << MaceVersion();
-  LOG(INFO) << "mace git version: " << MaceGitVersion();
   LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum();
   LOG(INFO) << "input node: " << FLAGS_input_node;
   LOG(INFO) << "input shape: " << FLAGS_input_shape;
@@ -431,14 +337,8 @@ int Main(int argc, char **argv) {
 #pragma omp parallel for
   for (int i = 0; i < FLAGS_restart_round; ++i) {
     VLOG(0) << "restart round " << i;
-    if (input_count == 1 && output_count == 1) {
-      ret = SingleInputAndOutput(input_shape_vec[0], output_shape_vec[0]);
-    } else {
-      ret = MultipleInputOrOutput(input_names,
-                                  input_shape_vec,
-                                  output_names,
-                                  output_shape_vec);
-    }
+    ret =
+        RunModel(input_names, input_shape_vec, output_names, output_shape_vec);
   }
   if (ret) {
     return 0;
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index 7a5df69d9ec43953025ee2d1f208e5aac7332ce3..565b3d569934a3b2618ffbf48d278fa0041b8953 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -95,7 +95,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
                       static_cast<uint32_t>(buffer->buffer_offset() /
                                             GetEnumTypeSize(buffer->dtype())));
   }
-  if (type == ARGUMENT) {
+  if (type == CONV2D_FILTER) {
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
+  } else if (type == ARGUMENT) {
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
   } else if (type == WEIGHT_HEIGHT || type == WEIGHT_WIDTH) {
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl
index 8e2f7184d63a3bed64d47aaaf66cc3b01b62943d..a5d9f289efc7266feb77c7c868b51cfe4b25fb28 100644
--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -2,6 +2,7 @@
 
 __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
                                      __private const int input_offset,
+                                     __private const int filter_h,
                                      __private const int filter_w,
                                      __private const int out_channel,
                                      __private const int in_channel,
@@ -22,16 +23,18 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
   }
 #endif
 
-  const int out_channel_idx = h * 4;
-  const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
-  const int hw_idx = w / rounded_in_channel;
-  const int in_channel_idx = w % rounded_in_channel;
+  const int in_channel_idx = w;
+  const int hw_size = filter_w * filter_h;
+  const int out_channel_idx = h / hw_size * 4;
+  const int hw_idx = h % hw_size;
   const int h_idx = hw_idx / filter_w;
   const int w_idx = hw_idx % filter_w;
-  const int offset = input_offset + ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
-                           + in_channel_idx;
+  const int offset = input_offset
+                     + ((h_idx * filter_w + w_idx) * out_channel
+                         + out_channel_idx) * in_channel
+                     + in_channel_idx;
 
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
+  DATA_TYPE4 values = 0;
   if (out_channel_idx < out_channel) {
     const int size = out_channel - out_channel_idx;
     if (size < 4) {
@@ -52,10 +55,11 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
   }
 
   int2 coord = (int2)(w, h);
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
+  WRITE_IMAGET(output, coord, values);
 }
 
 __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
+                                     __private const int filter_h,
                                      __private const int filter_w,
                                      __private const int out_channel,
                                      __private const int in_channel,
@@ -76,18 +80,19 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
   }
 #endif
 
-  const int out_channel_idx = h * 4;
-  const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
-  const int hw_idx = w / rounded_in_channel;
-  const int in_channel_idx = w % rounded_in_channel;
+  const int in_channel_idx = w;
+  const int hw_size = filter_w * filter_h;
+  const int out_channel_idx = h / hw_size * 4;
+  const int hw_idx = h % hw_size;
   const int h_idx = hw_idx / filter_w;
   const int w_idx = hw_idx % filter_w;
-  const int offset = ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
-                           + in_channel_idx;
+  const int offset = ((h_idx * filter_w + w_idx) * out_channel
+                         + out_channel_idx) * in_channel
+                     + in_channel_idx;
 
   if (out_channel_idx < out_channel) {
     int2 coord = (int2)(w, h);
-    VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
+    DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
     const int size = (out_channel - out_channel_idx);
     if (size < 4) {
       switch (size) {
@@ -200,7 +205,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                            + channel_idx;
 
   const int size = channels - channel_idx;
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
+  DATA_TYPE4 values = 0;
   if (size < 4) {
     switch(size) {
       case 3:
@@ -214,7 +219,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
     values = vload4(0, input + offset);
   }
   int2 coord = (int2)(w, h);
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
+  WRITE_IMAGET(output, coord, values);
 }
 
 __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
@@ -246,7 +251,7 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
                            + channel_idx;
 
   int2 coord = (int2)(w, h);
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
+  DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
   const int size = channels - channel_idx;
   if (size < 4) {
     switch (size) {
@@ -286,7 +291,7 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
   const int size = count - w * 4;
 
 
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
+  DATA_TYPE4 values = 0;
   if (size < 4) {
     switch(size) {
       case 3:
@@ -300,7 +305,7 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
     values = vload4(0, input + offset);
   }
   int2 coord = (int2)(w, h);
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
+  WRITE_IMAGET(output, coord, values);
 }
 
 __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
@@ -325,7 +330,7 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
   const int offset = w * 4;
 
   int2 coord = (int2)(w, h);
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
+  DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
   const int size = count - offset;
   if (size < 4) {
     switch (size) {
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index f85bf1080dae99432d1786fd9828dcf32c3b6d37..8fa23f02db62c5c43469e7c4f5095830ebeeed1e 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -1,7 +1,7 @@
 #include <common.h>
 
 __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                      __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */
+                      __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
     __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif
@@ -41,8 +41,6 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   const int out_w_blks = get_global_size(1);
 #endif
 
-  const int rounded_in_ch = in_ch_blks << 2;
-
 #ifdef BIAS
   DATA_TYPE4 out0 =
      READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
@@ -64,21 +62,21 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   const int height_idx = mad24((out_hb % out_height), stride, -padding_top);
 
   const int batch_idx = mul24((out_hb / out_height), in_height);
-  const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);
+  const int filter_hw = mul24(filter_width, filter_height);
 
   DATA_TYPE4 in0, in1, in2, in3;
   DATA_TYPE4 weights0, weights1, weights2, weights3;
   for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
     const int in_idx = mul24(in_ch_blk, in_width);
-    int filter_x_part0 = in_ch_blk << 2;
+    int filter_x_idx = in_ch_blk << 2;
+    int filter_y_idx = mul24(out_ch_blk, filter_hw);
     for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
-      // TODO(heliangliang) optimize out these muls
       int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
       in_hb_value = select(in_hb_value + batch_idx,
                            -1,
                            (in_hb_value < 0 || in_hb_value >= in_height));
 
-      int filter_x_part1 = 0;
+#pragma unroll
       for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
         int in_width_value;
 #define READ_INPUT(i)                                                                \
@@ -96,11 +94,10 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
 #undef READ_INPUT
 
         // int filter_idx = (hb_idx * filter_width + width_idx) * rounded_in_ch + (in_ch_blk << 2);
-        int filter_idx = filter_x_part0 + filter_x_part1;
-        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
-        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
-        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
-        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
+        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 0, filter_y_idx));
+        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 1, filter_y_idx));
+        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 2, filter_y_idx));
+        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 3, filter_y_idx));
 
         out0 = mad(in0.x, weights0, out0);
         out0 = mad(in0.y, weights1, out0);
@@ -123,9 +120,8 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
         out3 = mad(in3.z, weights2, out3);
         out3 = mad(in3.w, weights3, out3);
 
-        filter_x_part1 += rounded_in_ch;
+        filter_y_idx += 1;
       }
-      filter_x_part0 += rounded_in_ch_x_filter_width;
     }
   }
 
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 8f58255ab8100c6597bdb8bb701adbc8406e0537..8ce485b7f3f92e46d3d25a1a95bcc624564d1125 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -1,7 +1,7 @@
 #include <common.h>
 
 __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                          __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */
+                          __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif
@@ -39,8 +39,6 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int out_w_blks = get_global_size(1);
 #endif
 
-  const int rounded_in_ch = in_ch_blks << 2;
-
 #ifdef BIAS
   DATA_TYPE4 out0 =
      READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
@@ -65,19 +63,18 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int height_idx = mad24((out_hb % out_height), stride, -padding_top);
 
   const int batch_idx = mul24((out_hb / out_height), in_height);
-  const int rounded_in_ch_x_3 = (rounded_in_ch << 1) + rounded_in_ch;
 
   DATA_TYPE4 in0, in1, in2, in3, in4;
   DATA_TYPE4 weights0, weights1, weights2, weights3;
   for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
     const int in_idx = mul24(in_ch_blk, in_width);
-    int filter_x_part0 = in_ch_blk << 2;
+    int filter_x_idx = in_ch_blk << 2;
+    int filter_y_idx = mul24(out_ch_blk, 9);
     int in_hb_idx = height_idx;
     for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
       int in_hb_value = select(in_hb_idx + batch_idx,
                                -1,
                                (in_hb_idx < 0 || in_hb_idx >= in_height));
-      int filter_x_part1 = 0;
       int in_width_idx = 0;
       for (short width_idx = 0; width_idx < 3; ++width_idx) {
         int in_width_value;
@@ -97,11 +94,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #undef READ_INPUT
 
         // int filter_idx = (hb_idx * 3 + width_idx) * rounded_in_ch + (in_ch_blk << 2);
-        int filter_idx = filter_x_part0 + filter_x_part1;
-        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
-        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
-        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
-        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
+        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 0, filter_y_idx));
+        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 1, filter_y_idx));
+        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 2, filter_y_idx));
+        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_idx + 3, filter_y_idx));
 
         out0 = mad(in0.x, weights0, out0);
         out0 = mad(in0.y, weights1, out0);
@@ -129,10 +125,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
         out4 = mad(in4.z, weights2, out4);
         out4 = mad(in4.w, weights3, out4);
 
-        filter_x_part1 += rounded_in_ch;
         in_width_idx += dilation_w;
+        filter_y_idx += 1;
       }
-      filter_x_part0 += rounded_in_ch_x_3;
       in_hb_idx += dilation_h;
     }
   }
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index ba788a26977750ae69d37f90e6661e6612cdcf08..b8b8d6a3cc51de74c9cab7a8f85f86f61d218abd 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -23,13 +23,13 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
   (*image_shape)[1] = shape[0] * shape[1];
 }
 
-// [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
+// [RoundUp<4>(Ic), H * W * ((Oc + 3) / 4)]
 void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
                                std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 4);
   image_shape->resize(2);
-  (*image_shape)[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4);
-  (*image_shape)[1] = RoundUpDiv4(shape[2]);
+  (*image_shape)[0] = RoundUp<index_t>(shape[3], 4);
+  (*image_shape)[1] = shape[0] * shape[1] * RoundUpDiv4(shape[2]);
 }
 
 // [H * W * M, (Ic + 3) / 4]
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 90e4579eb9c53c4870a083f9871001420509318e..f06a7e127359e391a54b28bb4d35891416f32cbb 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -114,6 +114,7 @@ static void Conv2d(int iters,
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL); \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL);
 
+
 BM_CONV_2D(1, 256, 64, 64, 3, 3, 1, 1, VALID, 256);
 
 BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, 1, VALID, 1024);
@@ -135,6 +136,8 @@ BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, 1, SAME, 128);
 BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
 BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
 
+BM_CONV_2D(1, 1024, 16, 16, 15, 1, 1, 1, SAME, 2);
+
 // Dilation
 BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32);
 BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
diff --git a/mace/public/BUILD b/mace/public/BUILD
index f51c85b8c15a3d4588ab60f5a3f9cc57ae81681a..41c709878728c120daf7f2c82954f04b4a7e1809 100644
--- a/mace/public/BUILD
+++ b/mace/public/BUILD
@@ -7,11 +7,11 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//mace:mace.bzl", "if_android")
-
 cc_library(
     name = "public",
     hdrs = [
         "mace.h",
+        "mace_runtime.h",
+        "mace_types.h",
     ],
 )
diff --git a/mace/public/mace.h b/mace/public/mace.h
index eb74e6749517b1c7cd616a0073bd0c3c7965f916..db57fbcb9eaf27c3f8ea2f16ba062c6ca73a09fd 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -2,6 +2,9 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
+// This file defines core MACE APIs.
+// These APIs will be stable and backward compatible.
+
 #ifndef MACE_PUBLIC_MACE_H_
 #define MACE_PUBLIC_MACE_H_
 
@@ -13,415 +16,60 @@
 
 namespace mace {
 
-#define MACE_MAJOR_VERSION 0
-#define MACE_MINOR_VERSION 1
-#define MACE_PATCH_VERSION 0
-
-// MACE_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
-// "-beta", "-rc", "-rc.1")
-#define MACE_VERSION_SUFFIX ""
-
-#define MACE_STR_HELPER(x) #x
-#define MACE_STR(x) MACE_STR_HELPER(x)
-
-// e.g. "0.5.0" or "0.6.0-alpha".
-#define MACE_VERSION_STRING                                                    \
-  (MACE_STR(MACE_MAJOR_VERSION) "." MACE_STR(MACE_MINOR_VERSION) "." MACE_STR( \
-      MACE_PATCH_VERSION) MACE_VERSION_SUFFIX)
-
-inline const char *MaceVersion() { return MACE_VERSION_STRING; }
-
-extern const char *MaceGitVersion();
-
-// Disable the copy and assignment operator for a class.
-#ifndef DISABLE_COPY_AND_ASSIGN
-#define DISABLE_COPY_AND_ASSIGN(classname) \
- private:                                  \
-  classname(const classname &) = delete;   \
-  classname &operator=(const classname &) = delete
-#endif
-
-enum NetMode { INIT = 0, NORMAL = 1 };
+const char *MaceVersion();
 
 enum DeviceType { CPU = 0, NEON = 1, OPENCL = 2, HEXAGON = 3 };
 
-enum DataType {
-  DT_INVALID = 0,
-  DT_FLOAT = 1,
-  DT_DOUBLE = 2,
-  DT_INT32 = 3,
-  DT_UINT8 = 4,
-  DT_INT16 = 5,
-  DT_INT8 = 6,
-  DT_STRING = 7,
-  DT_INT64 = 8,
-  DT_UINT16 = 9,
-  DT_BOOL = 10,
-  DT_HALF = 19,
-  DT_UINT32 = 22
-};
-
-enum GPUPerfHint {
-  PERF_DEFAULT = 0,
-  PERF_LOW = 1,
-  PERF_NORMAL = 2,
-  PERF_HIGH = 3
-};
-
-enum GPUPriorityHint {
-  PRIORITY_DEFAULT = 0,
-  PRIORITY_LOW = 1,
-  PRIORITY_NORMAL = 2,
-  PRIORITY_HIGH = 3
-};
-
-enum CPUPowerOption { DEFAULT = 0, HIGH_PERFORMANCE = 1, BATTERY_SAVE = 2};
-
-class ConstTensor {
- public:
-  ConstTensor(const std::string &name,
-              const unsigned char *data,
-              const std::vector<int64_t> &dims,
-              const DataType data_type = DT_FLOAT,
-              uint32_t node_id = 0);
-  ConstTensor(const std::string &name,
-              const unsigned char *data,
-              const std::vector<int64_t> &dims,
-              const int data_type,
-              uint32_t node_id = 0);
-
-  const std::string &name() const;
-  const unsigned char *data() const;
-  int64_t data_size() const;
-  const std::vector<int64_t> &dims() const;
-  DataType data_type() const;
-  uint32_t node_id() const;
-
- private:
-  const std::string name_;
-  const unsigned char *data_;
-  const int64_t data_size_;
-  const std::vector<int64_t> dims_;
-  const DataType data_type_;
-  const uint32_t node_id_;
-};
-
-class Argument {
- public:
-  Argument();
-  void CopyFrom(const Argument &from);
-
- public:
-  const std::string &name() const;
-  void set_name(const std::string &value);
-  bool has_f() const;
-  float f() const;
-  void set_f(float value);
-  bool has_i() const;
-  int64_t i() const;
-  void set_i(int64_t value);
-  bool has_s() const;
-  std::string s() const;
-  void set_s(const std::string &value);
-  const std::vector<float> &floats() const;
-  void add_floats(float value);
-  void set_floats(const std::vector<float> &value);
-  const std::vector<int64_t> &ints() const;
-  void add_ints(int64_t value);
-  void set_ints(const std::vector<int64_t> &value);
-  const std::vector<std::string> &strings() const;
-  void add_strings(const ::std::string &value);
-  void set_strings(const std::vector<std::string> &value);
-
- private:
-  void set_has_f();
-  void set_has_i();
-  void set_has_s();
-
- private:
-  std::string name_;
-  float f_;
-  int64_t i_;
-  std::string s_;
-  std::vector<float> floats_;
-  std::vector<int64_t> ints_;
-  std::vector<std::string> strings_;
-  uint32_t has_bits_;
-};
-
-class NodeInput {
- public:
-  NodeInput() {}
-  NodeInput(int node_id, int output_port);
-  void CopyFrom(const NodeInput &from);
-
- public:
-  int node_id() const;
-  void set_node_id(int node_id);
-  int output_port() const;
-  void set_output_port(int output_port);
-
- private:
-  int node_id_;
-  int output_port_;
-};
-
-class OutputShape {
- public:
-  OutputShape();
-  OutputShape(const std::vector<int64_t> &dims);  // NOLINT(runtime/explicit)
-  void CopyFrom(const OutputShape &from);
-
- public:
-  const std::vector<int64_t> &dims() const;
-
- private:
-  std::vector<int64_t> dims_;
-};
-
-class OperatorDef {
- public:
-  void CopyFrom(const OperatorDef &from);
-
- public:
-  const std::string &name() const;
-  void set_name(const std::string &name_);
-  bool has_name() const;
-  const std::string &type() const;
-  void set_type(const std::string &type_);
-  bool has_type() const;
-  const std::vector<int> &mem_id() const;
-  void set_mem_id(const std::vector<int> &value);
-  uint32_t node_id() const;
-  void set_node_id(uint32_t node_id);
-  uint32_t op_id() const;
-  uint32_t padding() const;
-  void set_padding(uint32_t padding);
-  const std::vector<NodeInput> &node_input() const;
-  void add_node_input(const NodeInput &value);
-  const std::vector<int> &out_max_byte_size() const;
-  void add_out_max_byte_size(int value);
-  const std::vector<std::string> &input() const;
-  const std::string &input(int index) const;
-  std::string *add_input();
-  void add_input(const ::std::string &value);
-  void add_input(::std::string &&value);
-  void set_input(const std::vector<std::string> &value);
-  const std::vector<std::string> &output() const;
-  const std::string &output(int index) const;
-  std::string *add_output();
-  void add_output(const ::std::string &value);
-  void add_output(::std::string &&value);
-  void set_output(const std::vector<std::string> &value);
-  const std::vector<Argument> &arg() const;
-  Argument *add_arg();
-  const std::vector<OutputShape> &output_shape() const;
-  void add_output_shape(const OutputShape &value);
-  const std::vector<DataType> &output_type() const;
-  void set_output_type(const std::vector<DataType> &value);
-
- private:
-  void set_has_name();
-  void set_has_type();
-  void set_has_mem_id();
-
- private:
-  std::string name_;
-  std::string type_;
-
-  std::vector<std::string> input_;
-  std::vector<std::string> output_;
-  std::vector<Argument> arg_;
-  std::vector<OutputShape> output_shape_;
-  std::vector<DataType> output_type_;
-
-  std::vector<int> mem_id_;
-
-  // nnlib
-  uint32_t node_id_;
-  uint32_t op_id_;
-  uint32_t padding_;
-  std::vector<NodeInput> node_input_;
-  std::vector<int> out_max_byte_size_;
-
-  uint32_t has_bits_;
-};
-
-class MemoryBlock {
- public:
-  MemoryBlock(int mem_id, uint32_t x, uint32_t y);
-
- public:
-  int mem_id() const;
-  uint32_t x() const;
-  uint32_t y() const;
-
- private:
-  int mem_id_;
-  uint32_t x_;
-  uint32_t y_;
-};
-
-class MemoryArena {
- public:
-  const std::vector<MemoryBlock> &mem_block() const;
-  std::vector<MemoryBlock> &mutable_mem_block();
-  int mem_block_size() const;
-
- private:
-  std::vector<MemoryBlock> mem_block_;
-};
+enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 };
 
-// for hexagon mace-nnlib
-class InputInfo {
+// MACE input/output tensor
+class MaceTensor {
  public:
-  const std::string &name() const;
-  int32_t node_id() const;
-  int32_t max_byte_size() const;
-  DataType data_type() const;
-  const std::vector<int32_t> &dims() const;
+  // shape - the shape of the tensor, with size n
+  // data - the buffer of the tensor; must not be null, and its size must
+  //        equal shape[0] * shape[1] * ... * shape[n-1]
+  explicit MaceTensor(const std::vector<int64_t> &shape,
+                      std::shared_ptr<float> data);
+  MaceTensor();
+  MaceTensor(const MaceTensor &other);
+  MaceTensor(const MaceTensor &&other);
+  MaceTensor &operator=(const MaceTensor &other);
+  MaceTensor &operator=(const MaceTensor &&other);
+  ~MaceTensor();
 
- private:
-  std::string name_;
-  int32_t node_id_;
-  int32_t max_byte_size_;  // only support 32-bit len
-  DataType data_type_;
-  std::vector<int32_t> dims_;
-};
-
-class OutputInfo {
- public:
-  const std::string &name() const;
-  int32_t node_id() const;
-  int32_t max_byte_size() const;
-  DataType data_type() const;
-  void set_data_type(DataType data_type);
-  const std::vector<int32_t> &dims() const;
-  void set_dims(const std::vector<int32_t> &dims);
+  const std::vector<int64_t> &shape() const;
+  const std::shared_ptr<float> data() const;
+  std::shared_ptr<float> data();
 
  private:
-  std::string name_;
-  int32_t node_id_;
-  int32_t max_byte_size_;  // only support 32-bit len
-  DataType data_type_;
-  std::vector<int32_t> dims_;
+  class Impl;
+  std::unique_ptr<Impl> impl_;
 };
 
-class NetDef {
- public:
-  NetDef();
-  int op_size() const;
-
-  const OperatorDef &op(const int idx) const;
-
- public:
-  const std::string &name() const;
-  bool has_name() const;
-  void set_name(const std::string &value);
-  const std::string &version() const;
-  bool has_version() const;
-  void set_version(const std::string &value);
-
-  const std::vector<OperatorDef> &op() const;
-  OperatorDef *add_op();
-  std::vector<OperatorDef> &mutable_op();
-  const std::vector<Argument> &arg() const;
-  Argument *add_arg();
-  std::vector<Argument> &mutable_arg();
-  const std::vector<ConstTensor> &tensors() const;
-  std::vector<ConstTensor> &mutable_tensors();
-  const MemoryArena &mem_arena() const;
-  bool has_mem_arena() const;
-  MemoryArena &mutable_mem_arena();
-  const std::vector<InputInfo> &input_info() const;
-  const std::vector<OutputInfo> &output_info() const;
-  std::vector<OutputInfo> &mutable_output_info();
-
- private:
-  void set_has_name();
-  void set_has_version();
-  void set_has_mem_arena();
-
- private:
-  std::string name_;
-  std::string version_;
-  std::vector<OperatorDef> op_;
-  std::vector<Argument> arg_;
-  std::vector<ConstTensor> tensors_;
-
-  // for mem optimization
-  MemoryArena mem_arena_;
-
-  // for hexagon mace-nnlib
-  std::vector<InputInfo> input_info_;
-  std::vector<OutputInfo> output_info_;
-
-  uint32_t has_bits_;
-};
-
-struct CallStats {
-  int64_t start_micros;
-  int64_t end_micros;
-};
-
-struct OperatorStats {
-  std::string operator_name;
-  std::string type;
-  CallStats stats;
-};
-
-struct RunMetadata {
-  std::vector<OperatorStats> op_stats;
-};
-
-class Workspace;
-class NetBase;
-class OperatorRegistry;
-class HexagonControlWrapper;
-
-struct MaceInputInfo {
-  std::string name;
-  std::vector<int64_t> shape;
-  const float *data;
-};
-
-void ConfigOpenCLRuntime(GPUPerfHint, GPUPriorityHint);
-void ConfigOmpThreadsAndAffinity(int omp_num_threads,
-                                 CPUPowerOption power_option);
+class NetDef;
+class RunMetadata;
 
 class MaceEngine {
  public:
-  // Single input and output
-  explicit MaceEngine(const NetDef *net_def, DeviceType device_type);
-  // Multiple input or output
   explicit MaceEngine(const NetDef *net_def,
                       DeviceType device_type,
                       const std::vector<std::string> &input_nodes,
                       const std::vector<std::string> &output_nodes);
   ~MaceEngine();
-  // Single input and output
-  bool Run(const float *input,
-           const std::vector<int64_t> &input_shape,
-           float *output);
-  // Single input and output for benchmark
-  bool Run(const float *input,
-           const std::vector<int64_t> &input_shape,
-           float *output,
-           RunMetadata *run_metadata);
-  // Multiple input or output
-  bool Run(
-      const std::vector<MaceInputInfo> &input,
-      std::map<std::string, float *> &output,  // NOLINT(runtime/references)
-      RunMetadata *run_metadata = nullptr);
-  MaceEngine(const MaceEngine &) = delete;
-  MaceEngine &operator=(const MaceEngine &) = delete;
+
+  MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
+                 std::map<std::string, MaceTensor> *outputs);
+
+  MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
+                 std::map<std::string, MaceTensor> *outputs,
+                 RunMetadata *run_metadata);
 
  private:
-  std::shared_ptr<OperatorRegistry> op_registry_;
-  DeviceType device_type_;
-  std::unique_ptr<Workspace> ws_;
-  std::unique_ptr<NetBase> net_;
-  std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+
+  MaceEngine(const MaceEngine &) = delete;
+  MaceEngine &operator=(const MaceEngine &) = delete;
 };
 
 }  // namespace mace
diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..1887f00993301198d33f6a32e5d601c14b8a998b
--- /dev/null
+++ b/mace/public/mace_runtime.h
@@ -0,0 +1,35 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+// This file defines runtime tuning APIs.
+// These APIs are not stable.
+
+#ifndef MACE_PUBLIC_MACE_RUNTIME_H_
+#define MACE_PUBLIC_MACE_RUNTIME_H_
+
+namespace mace {
+
+enum GPUPerfHint {
+  PERF_DEFAULT = 0,
+  PERF_LOW = 1,
+  PERF_NORMAL = 2,
+  PERF_HIGH = 3
+};
+
+enum GPUPriorityHint {
+  PRIORITY_DEFAULT = 0,
+  PRIORITY_LOW = 1,
+  PRIORITY_NORMAL = 2,
+  PRIORITY_HIGH = 3
+};
+
+enum CPUPowerOption { DEFAULT = 0, HIGH_PERFORMANCE = 1, BATTERY_SAVE = 2 };
+
+void ConfigOpenCLRuntime(GPUPerfHint, GPUPriorityHint);
+void ConfigOmpThreadsAndAffinity(int omp_num_threads,
+                                 CPUPowerOption power_option);
+
+}  // namespace mace
+
+#endif  // MACE_PUBLIC_MACE_RUNTIME_H_
diff --git a/mace/public/mace_types.h b/mace/public/mace_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..141825830b1faa82d9cca82fee48b839ed6c0d1b
--- /dev/null
+++ b/mace/public/mace_types.h
@@ -0,0 +1,341 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+// This file defines data types used by net creation and benchmark tools.
+// These APIs are not stable and should only be used by advanced users.
+
+#ifndef MACE_PUBLIC_MACE_TYPES_H_
+#define MACE_PUBLIC_MACE_TYPES_H_
+
+#include <string>
+#include <vector>
+
+namespace mace {
+
+// Disable the copy and assignment operator for a class.
+#ifndef DISABLE_COPY_AND_ASSIGN
+#define DISABLE_COPY_AND_ASSIGN(classname) \
+ private:                                  \
+  classname(const classname &) = delete;   \
+  classname &operator=(const classname &) = delete
+#endif
+
+enum NetMode { INIT = 0, NORMAL = 1 };
+
+enum DataType {
+  DT_INVALID = 0,
+  DT_FLOAT = 1,
+  DT_DOUBLE = 2,
+  DT_INT32 = 3,
+  DT_UINT8 = 4,
+  DT_INT16 = 5,
+  DT_INT8 = 6,
+  DT_STRING = 7,
+  DT_INT64 = 8,
+  DT_UINT16 = 9,
+  DT_BOOL = 10,
+  DT_HALF = 19,
+  DT_UINT32 = 22
+};
+
+class ConstTensor {
+ public:
+  ConstTensor(const std::string &name,
+              const unsigned char *data,
+              const std::vector<int64_t> &dims,
+              const DataType data_type = DT_FLOAT,
+              uint32_t node_id = 0);
+  ConstTensor(const std::string &name,
+              const unsigned char *data,
+              const std::vector<int64_t> &dims,
+              const int data_type,
+              uint32_t node_id = 0);
+
+  const std::string &name() const;
+  const unsigned char *data() const;
+  int64_t data_size() const;
+  const std::vector<int64_t> &dims() const;
+  DataType data_type() const;
+  uint32_t node_id() const;
+
+ private:
+  const std::string name_;
+  const unsigned char *data_;
+  const int64_t data_size_;
+  const std::vector<int64_t> dims_;
+  const DataType data_type_;
+  const uint32_t node_id_;
+};
+
+class Argument {
+ public:
+  Argument();
+  void CopyFrom(const Argument &from);
+
+ public:
+  const std::string &name() const;
+  void set_name(const std::string &value);
+  bool has_f() const;
+  float f() const;
+  void set_f(float value);
+  bool has_i() const;
+  int64_t i() const;
+  void set_i(int64_t value);
+  bool has_s() const;
+  std::string s() const;
+  void set_s(const std::string &value);
+  const std::vector<float> &floats() const;
+  void add_floats(float value);
+  void set_floats(const std::vector<float> &value);
+  const std::vector<int64_t> &ints() const;
+  void add_ints(int64_t value);
+  void set_ints(const std::vector<int64_t> &value);
+  const std::vector<std::string> &strings() const;
+  void add_strings(const ::std::string &value);
+  void set_strings(const std::vector<std::string> &value);
+
+ private:
+  void set_has_f();
+  void set_has_i();
+  void set_has_s();
+
+ private:
+  std::string name_;
+  float f_;
+  int64_t i_;
+  std::string s_;
+  std::vector<float> floats_;
+  std::vector<int64_t> ints_;
+  std::vector<std::string> strings_;
+  uint32_t has_bits_;
+};
+
+class NodeInput {
+ public:
+  NodeInput() {}
+  NodeInput(int node_id, int output_port);
+  void CopyFrom(const NodeInput &from);
+
+ public:
+  int node_id() const;
+  void set_node_id(int node_id);
+  int output_port() const;
+  void set_output_port(int output_port);
+
+ private:
+  int node_id_;
+  int output_port_;
+};
+
+class OutputShape {
+ public:
+  OutputShape();
+  OutputShape(const std::vector<int64_t> &dims);  // NOLINT(runtime/explicit)
+  void CopyFrom(const OutputShape &from);
+
+ public:
+  const std::vector<int64_t> &dims() const;
+
+ private:
+  std::vector<int64_t> dims_;
+};
+
+class OperatorDef {
+ public:
+  void CopyFrom(const OperatorDef &from);
+
+ public:
+  const std::string &name() const;
+  void set_name(const std::string &name_);
+  bool has_name() const;
+  const std::string &type() const;
+  void set_type(const std::string &type_);
+  bool has_type() const;
+  const std::vector<int> &mem_id() const;
+  void set_mem_id(const std::vector<int> &value);
+  uint32_t node_id() const;
+  void set_node_id(uint32_t node_id);
+  uint32_t op_id() const;
+  uint32_t padding() const;
+  void set_padding(uint32_t padding);
+  const std::vector<NodeInput> &node_input() const;
+  void add_node_input(const NodeInput &value);
+  const std::vector<int> &out_max_byte_size() const;
+  void add_out_max_byte_size(int value);
+  const std::vector<std::string> &input() const;
+  const std::string &input(int index) const;
+  std::string *add_input();
+  void add_input(const ::std::string &value);
+  void add_input(::std::string &&value);
+  void set_input(const std::vector<std::string> &value);
+  const std::vector<std::string> &output() const;
+  const std::string &output(int index) const;
+  std::string *add_output();
+  void add_output(const ::std::string &value);
+  void add_output(::std::string &&value);
+  void set_output(const std::vector<std::string> &value);
+  const std::vector<Argument> &arg() const;
+  Argument *add_arg();
+  const std::vector<OutputShape> &output_shape() const;
+  void add_output_shape(const OutputShape &value);
+  const std::vector<DataType> &output_type() const;
+  void set_output_type(const std::vector<DataType> &value);
+
+ private:
+  void set_has_name();
+  void set_has_type();
+  void set_has_mem_id();
+
+ private:
+  std::string name_;
+  std::string type_;
+
+  std::vector<std::string> input_;
+  std::vector<std::string> output_;
+  std::vector<Argument> arg_;
+  std::vector<OutputShape> output_shape_;
+  std::vector<DataType> output_type_;
+
+  std::vector<int> mem_id_;
+
+  // nnlib
+  uint32_t node_id_;
+  uint32_t op_id_;
+  uint32_t padding_;
+  std::vector<NodeInput> node_input_;
+  std::vector<int> out_max_byte_size_;
+
+  uint32_t has_bits_;
+};
+
+class MemoryBlock {
+ public:
+  MemoryBlock(int mem_id, uint32_t x, uint32_t y);
+
+ public:
+  int mem_id() const;
+  uint32_t x() const;
+  uint32_t y() const;
+
+ private:
+  int mem_id_;
+  uint32_t x_;
+  uint32_t y_;
+};
+
+class MemoryArena {
+ public:
+  const std::vector<MemoryBlock> &mem_block() const;
+  std::vector<MemoryBlock> &mutable_mem_block();
+  int mem_block_size() const;
+
+ private:
+  std::vector<MemoryBlock> mem_block_;
+};
+
+// for hexagon mace-nnlib
+class InputInfo {
+ public:
+  const std::string &name() const;
+  int32_t node_id() const;
+  int32_t max_byte_size() const;
+  DataType data_type() const;
+  const std::vector<int32_t> &dims() const;
+
+ private:
+  std::string name_;
+  int32_t node_id_;
+  int32_t max_byte_size_;  // only support 32-bit len
+  DataType data_type_;
+  std::vector<int32_t> dims_;
+};
+
+class OutputInfo {
+ public:
+  const std::string &name() const;
+  int32_t node_id() const;
+  int32_t max_byte_size() const;
+  DataType data_type() const;
+  void set_data_type(DataType data_type);
+  const std::vector<int32_t> &dims() const;
+  void set_dims(const std::vector<int32_t> &dims);
+
+ private:
+  std::string name_;
+  int32_t node_id_;
+  int32_t max_byte_size_;  // only support 32-bit len
+  DataType data_type_;
+  std::vector<int32_t> dims_;
+};
+
+class NetDef {
+ public:
+  NetDef();
+  int op_size() const;
+
+  const OperatorDef &op(const int idx) const;
+
+ public:
+  const std::string &name() const;
+  bool has_name() const;
+  void set_name(const std::string &value);
+  const std::string &version() const;
+  bool has_version() const;
+  void set_version(const std::string &value);
+
+  const std::vector<OperatorDef> &op() const;
+  OperatorDef *add_op();
+  std::vector<OperatorDef> &mutable_op();
+  const std::vector<Argument> &arg() const;
+  Argument *add_arg();
+  std::vector<Argument> &mutable_arg();
+  const std::vector<ConstTensor> &tensors() const;
+  std::vector<ConstTensor> &mutable_tensors();
+  const MemoryArena &mem_arena() const;
+  bool has_mem_arena() const;
+  MemoryArena &mutable_mem_arena();
+  const std::vector<InputInfo> &input_info() const;
+  const std::vector<OutputInfo> &output_info() const;
+  std::vector<OutputInfo> &mutable_output_info();
+
+ private:
+  void set_has_name();
+  void set_has_version();
+  void set_has_mem_arena();
+
+ private:
+  std::string name_;
+  std::string version_;
+  std::vector<OperatorDef> op_;
+  std::vector<Argument> arg_;
+  std::vector<ConstTensor> tensors_;
+
+  // for mem optimization
+  MemoryArena mem_arena_;
+
+  // for hexagon mace-nnlib
+  std::vector<InputInfo> input_info_;
+  std::vector<OutputInfo> output_info_;
+
+  uint32_t has_bits_;
+};
+
+struct CallStats {
+  int64_t start_micros;
+  int64_t end_micros;
+};
+
+struct OperatorStats {
+  std::string operator_name;
+  std::string type;
+  CallStats stats;
+};
+
+struct RunMetadata {
+  std::vector<OperatorStats> op_stats;
+};
+
+}  // namespace mace
+
+#endif  // MACE_PUBLIC_MACE_TYPES_H_
diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py
index 7c7cd9abd71cb8b4720f782ffc71835033c3e97c..166bb6ec3012f2f3075e6dc5577dd5a9e6832463 100644
--- a/mace/python/tools/caffe_converter_lib.py
+++ b/mace/python/tools/caffe_converter_lib.py
@@ -72,9 +72,9 @@ class Shapes(object):
     output_shape = np.zeros_like(input_shape)
     output_shape[0] = input_shape[0]
     output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
-                       - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
+                                      - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
     output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
-                       - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
+                                      - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
     output_shape[3] = filter_shape[2]
     return output_shape
 
@@ -247,12 +247,9 @@ class CaffeConverter(object):
     arg.i = self.dt
     return output_name
 
-  def add_input_transform(self, names, is_single):
+  def add_input_transform(self, names):
     for name in names:
-      if is_single:
-        new_input_name = MACE_INPUT_NODE_NAME + ":0"
-      else:
-        new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
+      new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
       op_def = self.net_def.op.add()
       op_def.name = name
       op_def.type = 'BufferToImage'
@@ -267,12 +264,9 @@ class CaffeConverter(object):
       arg.name = 'T'
       arg.i = self.dt
 
-  def add_output_transform(self, names, is_single):
+  def add_output_transform(self, names):
     for name in names:
-      if is_single:
-        output_name = MACE_OUTPUT_NODE_NAME + ":0"
-      else:
-        output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
+      output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
       op_def = self.net_def.op.add()
       op_def.name = output_name[:-2]
       op_def.type = 'ImageToBuffer'
@@ -333,8 +327,18 @@ class CaffeConverter(object):
     return pad, stride, kernel
 
   def convert_conv2d(self, op):
-    op_def = self.CommonConvert(op, 'Conv2D')
     param = op.layer.convolution_param
+    is_depthwise = False
+    if param.HasField('group'):
+      if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
+        is_depthwise = True
+      else:
+        raise Exception("Mace do not support group convolution yet")
+
+    if is_depthwise:
+      op_def = self.CommonConvert(op, 'DepthwiseConv2d')
+    else:
+      op_def = self.CommonConvert(op, 'Conv2D')
 
     # Add filter
     weight_tensor_name = op.name + '_weight:0'
@@ -342,7 +346,7 @@ class CaffeConverter(object):
     self.add_tensor(weight_tensor_name, weight_data)
 
     if self.device == 'gpu':
-      buffer_type = "CONV2D_FILTER"
+      buffer_type = "DW_CONV2D_FILTER" if is_depthwise else "CONV2D_FILTER"
       output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
       op_def.input.extend([output_name])
     else:
@@ -373,15 +377,16 @@ class CaffeConverter(object):
     self.resolved_ops.add(op.name)
 
     output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]],
-                                          weight_data.shape,
-                                          paddings, strides, dilations,
-                                          math.floor)
+      weight_data.shape,
+      paddings, strides, dilations,
+      math.floor)
     op.output_shape_map[op.layer.top[0]] = output_shape
 
     if len(self.ops_map[final_op.name].children) == 1 \
         and self.ops_map[final_op.name].children[0].type in activation_name_map:
       activation_op = self.ops_map[final_op.name].children[0]
-      op_def.type = "FusedConv2D"
+      if not is_depthwise:
+        op_def.type = "FusedConv2D"
       fused_act_arg = op_def.arg.add()
       fused_act_arg.name = 'activation'
       fused_act_arg.s = activation_name_map[activation_op.type]
@@ -412,7 +417,7 @@ class CaffeConverter(object):
     width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
     return self.winograd and self.device == 'gpu' and \
            filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
-           dilations[0] == 1 and (dilations[0] == dilations[1]) and\
+           dilations[0] == 1 and (dilations[0] == dilations[1]) and \
            (strides[0] == 1) and (strides[0] == strides[1]) and \
            (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
            (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
@@ -662,7 +667,7 @@ class CaffeConverter(object):
 
     filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]]
     output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
-                                          paddings, strides, [1, 1], math.ceil)
+      paddings, strides, [1, 1], math.ceil)
     op.output_shape_map[op.layer.top[0]] = output_shape
 
     op_def.output.extend([op.name + ':0'])
@@ -764,7 +769,7 @@ class CaffeConverter(object):
     input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
     num_outputs = len(op.layer.top)
     if (input_shape[3] % num_outputs) != 0 or \
-      (self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) :
+        (self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) :
       raise Exception('Mace do not support slice with input shape '
                       + str(input_shape) + ' and number of output ' + str(num_outputs))
     output_shape = Shapes.slice_shape(input_shape, num_outputs)
@@ -789,7 +794,6 @@ class CaffeConverter(object):
     input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
     output_shape = input_shape
     shape_param = np.asarray(op.layer.reshape_param.shape.dim)[[0, 3, 2, 1]]
-    print shape_param
     for i in range(len(shape_param)):
       if shape_param[i] != 0:
         output_shape[i] = shape_param[i]
@@ -844,29 +848,20 @@ class CaffeConverter(object):
     self.net_def.op.extend([op_def])
     self.resolved_ops.add(op.name)
 
-  def replace_in_out_name(self, input_names, output_names, is_single):
+  def replace_in_out_name(self, input_names, output_names):
     in_names = set([input_name + ":0" for input_name in input_names])
     out_names = set([output_name + ":0" for output_name in output_names])
-    if is_single:
-      for op in self.net_def.op:
-        for i in range(len(op.input)):
-          if op.input[i] in in_names:
-            op.input[i] = MACE_INPUT_NODE_NAME + ':0'
-        for i in range(len(op.output)):
-          if op.output[i] in out_names:
-            op.output[i] = MACE_OUTPUT_NODE_NAME + ':0'
-    else:
-      for op in self.net_def.op:
-        for i in range(len(op.input)):
-          if op.input[i] in in_names:
-            op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i]
-          if op.input[i] in out_names:
-            op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i]
-        for i in range(len(op.output)):
-          if op.output[i] in in_names:
-            op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i]
-          if op.output[i] in out_names:
-            op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i]
+    for op in self.net_def.op:  # prefix graph input/output tensor names in every op
+      for i in range(len(op.input)):
+        if op.input[i] in in_names:
+          op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i]
+        if op.input[i] in out_names:  # re-reads the slot, i.e. sees any rename made just above
+          op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i]
+      for i in range(len(op.output)):  # same two checks for the op's outputs
+        if op.output[i] in in_names:
+          op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i]
+        if op.output[i] in out_names:  # re-reads the slot, i.e. sees any rename made just above
+          op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i]
 
   def add_input_op_shape(self, input_nodes, input_shapes):
     assert len(input_nodes) == len(input_shapes)
@@ -878,9 +873,8 @@ class CaffeConverter(object):
         input_op.output_shape_map[input_op.name] = input_shapes[i]
 
   def convert(self, input_nodes, input_shapes, output_nodes):
-    is_single = len(input_nodes) == 1 and len(output_nodes) == 1
     if self.device == 'gpu':
-      self.add_input_transform(input_nodes, is_single)
+      self.add_input_transform(input_nodes)
 
     assert self.ops[0].type == 'Input'
     self.add_input_op_shape(input_nodes, input_shapes)
@@ -925,10 +919,10 @@ class CaffeConverter(object):
         raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
 
     if self.device == 'gpu':
-      self.add_output_transform(output_nodes, is_single)
+      self.add_output_transform(output_nodes)
 
     if self.device == 'cpu':
-      self.replace_in_out_name(input_nodes, output_nodes, is_single)
+      self.replace_in_out_name(input_nodes, output_nodes)
 
     for op in self.ops:
       if op.name not in self.resolved_ops:
@@ -967,3 +961,4 @@ def convert_to_mace_pb(model_file, weight_file, input_node_str, input_shape_str,
     print "Memory optimization done."
 
   return net_def
+
diff --git a/mace/python/tools/model_header.jinja2 b/mace/python/tools/model_header.jinja2
index 9f5c776d52bd6456bf3c410216f5b4de1ce1fa58..efd477ee253ccead1ebf2f58c54f624629118280 100644
--- a/mace/python/tools/model_header.jinja2
+++ b/mace/python/tools/model_header.jinja2
@@ -1,7 +1,10 @@
 //
 // Copyright (c) 2017 XiaoMi All rights reserved.
-// Generated by the mace converter.  DO NOT EDIT!
 //
+// Generated by the mace converter.  DO NOT EDIT!
+
+#ifndef MACE_CODEGEN_MODELS_{{tag|upper}}_{{tag|upper}}_H_
+#define MACE_CODEGEN_MODELS_{{tag|upper}}_{{tag|upper}}_H_
 
 #include <string>
 
@@ -10,13 +13,16 @@
 namespace mace {
 namespace {{tag}} {
 
-extern const unsigned char *LoadModelData(const char *model_data_file);
+const unsigned char *LoadModelData(const char *model_data_file);
 
-extern void UnloadModelData(const unsigned char *model_data);
+void UnloadModelData(const unsigned char *model_data);
 
-extern NetDef CreateNet(const unsigned char *model_data);
+NetDef CreateNet(const unsigned char *model_data);
 
-extern const std::string ModelChecksum();
+const std::string ModelChecksum();
 
 }  // namespace {{ tag }}
 }  // namespace mace
+
+#endif  // MACE_CODEGEN_MODELS_{{tag|upper}}_{{tag|upper}}_H_
+
diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py
index 7177a691dabae5bac8fe0fd884d05850d4bac586..01e73645e8439d77ba5b4b6bada8f84e7c3eae9a 100644
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -118,12 +118,9 @@ class TFConverter(object):
     arg.i = self.dt
     return output_name
 
-  def add_input_transform(self, names, is_single):
+  def add_input_transform(self, names):
     for name in names:
-      if is_single:
-        new_input_name = MACE_INPUT_NODE_NAME + ":0"
-      else:
-        new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
+      new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
       op_def = self.net_def.op.add()
       op_def.name = name
       op_def.type = 'BufferToImage'
@@ -138,12 +135,9 @@ class TFConverter(object):
       arg.name = 'T'
       arg.i = self.dt
 
-  def add_output_transform(self, names, is_single):
+  def add_output_transform(self, names):
     for name in names:
-      if is_single:
-        output_name = MACE_OUTPUT_NODE_NAME + ":0"
-      else:
-        output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
+      output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
       op_def = self.net_def.op.add()
       op_def.name = output_name[:-2]
       op_def.type = 'ImageToBuffer'
@@ -362,7 +356,8 @@ class TFConverter(object):
     if len(self.tf_graph.get(final_op.name, [])) == 1 \
         and self.tf_graph[final_op.name][0].type in activation_name_map:
       activation_op = self.tf_graph[final_op.name][0]
-      op_def.type = "FusedConv2D"
+      if op_def.type == "Conv2D":
+        op_def.type = "FusedConv2D"
       fused_act_arg = op_def.arg.add()
       fused_act_arg.name = 'activation'
       fused_act_arg.s = activation_name_map[activation_op.type]
@@ -805,26 +800,18 @@ class TFConverter(object):
     self.add_output_shape(op.outputs, op_def)
     self.resolved_ops[op.name] = 1
 
-  def replace_in_out_name(self, input_names, output_names, is_single):
+  def replace_in_out_name(self, input_names, output_names):
     in_names = set([input_name + ":0" for input_name in input_names])
     out_names = set([output_name + ":0" for output_name in output_names])
-    if is_single:
-      for op in self.net_def.op:
-        if len(op.input) > 0 and op.input[0] in in_names:
-          op.input[0] = MACE_INPUT_NODE_NAME + ':0'
-        if len(op.output) > 0 and op.output[0] in out_names:
-          op.output[0] = MACE_OUTPUT_NODE_NAME + ':0'
-    else:
-      for op in self.net_def.op:
-        if len(op.input) > 0 and op.input[0] in in_names:
-          op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0]
-        if len(op.output) > 0 and op.output[0] in out_names:
-          op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0]
+    for op in self.net_def.op:
+      if op.input[0] in in_names:
+        op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0]
+      if op.output[0] in out_names:
+        op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0]
 
   def convert(self, input_nodes, output_nodes):
-    is_single = len(input_nodes) == 1 and len(output_nodes) == 1
     if self.device == 'gpu':
-      self.add_input_transform(input_nodes, is_single)
+      self.add_input_transform(input_nodes)
 
     for op in self.tf_ops:
       if self.resolved_ops[op.name] == 1:
@@ -892,10 +879,10 @@ class TFConverter(object):
         raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
 
     if self.device == 'gpu':
-      self.add_output_transform(output_nodes, is_single)
+      self.add_output_transform(output_nodes)
 
     if self.device == 'cpu':
-      self.replace_in_out_name(input_nodes, output_nodes, is_single)
+      self.replace_in_out_name(input_nodes, output_nodes)
 
     for key in self.resolved_ops:
       if self.resolved_ops[key] != 1:
diff --git a/mace/tools/git/gen_version_source.sh b/mace/tools/git/gen_version_source.sh
index 6b17c56c5e5f43a6113806517f59acb15a97a70f..36b0233f1e0edffcefe3079437df45e1cd45bd32 100644
--- a/mace/tools/git/gen_version_source.sh
+++ b/mace/tools/git/gen_version_source.sh
@@ -25,6 +25,6 @@ cat <<EOF > ${OUTPUT_FILENAME}
 // This is a generated file, DO NOT EDIT
 
 namespace mace {
-  const char *MaceGitVersion() { return "${GIT_VERSION}"; }
+  const char *MaceVersion() { return "${GIT_VERSION}"; }
 }  // namespace mace
 EOF
diff --git a/mace/utils/logging.h b/mace/utils/logging.h
index 4a6da3befb985a57d702a1766799722f6271bb36..594381437ea5a9a8261fa540d27c770119921c8f 100644
--- a/mace/utils/logging.h
+++ b/mace/utils/logging.h
@@ -12,6 +12,7 @@
 #include <utility>
 
 #include "mace/public/mace.h"
+#include "mace/public/mace_types.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/string_util.h"
 
diff --git a/tools/mace_tools.py b/tools/mace_tools.py
index c9a22f6472e33f8b8245cee9da5796c32d5d5e1d..4f2b209a700439fffd6f466551c0dffceb555805 100644
--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -76,26 +76,28 @@ def generate_random_input(target_soc, model_output_dir,
       target_soc, model_output_dir, int(generate_data_or_not))
   run_command(command)
 
-  input_name_list = []
   input_file_list = []
-  if isinstance(input_names, list):
-    input_name_list.extend(input_names)
-  else:
-    input_name_list.append(input_names)
   if isinstance(input_files, list):
     input_file_list.extend(input_files)
   else:
     input_file_list.append(input_files)
-  assert len(input_file_list) == len(input_name_list)
-  for i in range(len(input_file_list)):
-    if input_file_list[i] is not None:
-      dst_input_file = model_output_dir + '/' + input_file_name(input_name_list[i])
-      if input_file_list[i].startswith("http://") or \
-          input_file_list[i].startswith("https://"):
-        urllib.urlretrieve(input_file_list[i], dst_input_file)
-      else:
-        print 'Copy input data:', dst_input_file
-        shutil.copy(input_file_list[i], dst_input_file)
+  if len(input_file_list) != 0:
+    input_name_list = []
+    if isinstance(input_names, list):
+      input_name_list.extend(input_names)
+    else:
+      input_name_list.append(input_names)
+    if len(input_file_list) != len(input_name_list):
+      raise Exception('If input_files set, the input files should match the input names.')
+    for i in range(len(input_file_list)):
+      if input_file_list[i] is not None:
+        dst_input_file = model_output_dir + '/' + input_file_name(input_name_list[i])
+        if input_file_list[i].startswith("http://") or \
+            input_file_list[i].startswith("https://"):
+          urllib.urlretrieve(input_file_list[i], dst_input_file)
+        else:
+          print 'Copy input data:', dst_input_file
+          shutil.copy(input_file_list[i], dst_input_file)
 
 def generate_model_code():
   command = "bash tools/generate_model_code.sh"