Commit bdaa5c18 authored by Y yejianwu

Merge branch 'master' of v9.git.n.xiaomi.com:deep-computing/mace into load_model_in_pb

......@@ -25,7 +25,8 @@ docs:
- cd docs
- make html
- CI_LATEST_OUTPUT_PATH=/mace-build-output/$CI_PROJECT_NAME/latest
- CI_JOB_OUTPUT_PATH=/mace-build-output/$CI_PROJECT_NAME/$CI_BUILD_ID
- CI_JOB_OUTPUT_PATH=/mace-build-output/$CI_PROJECT_NAME/$CI_PIPELINE_ID
- rm -rf $CI_JOB_OUTPUT_PATH
- mkdir -p $CI_JOB_OUTPUT_PATH
- cp -r _build/html $CI_JOB_OUTPUT_PATH/docs
- rm -rf $CI_LATEST_OUTPUT_PATH
......
......@@ -110,7 +110,8 @@ RUN apt-get install -y --no-install-recommends \
# Install tools
RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com setuptools
RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.6.0 \
RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.7.0 \
numpy>=1.14.0 \
scipy \
jinja2 \
pyyaml \
......
Frequently asked questions
==========================
Does the tensor data consume extra memory when compiled into C++ code?
----------------------------------------------------------------------
When compiled into C++ code, the data will be mmapped by the system loader.
For the CPU runtime, the tensor data is used directly without a memory copy.
For the GPU and DSP runtimes, the tensor data is used only once during model
initialization. The operating system is free to swap the pages out; however,
it still consumes virtual memory address space. So, generally speaking, it
takes no extra physical memory. If you are short of virtual memory space
(which should be very rare), you can choose to load the tensor data from a
file, which can be unmapped after initialization.
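As a rough illustration (a sketch, not the authoritative API reference), the file-based option corresponds to passing a tensor-data file path when creating the engine; the model tag, tensor names, path, and namespace usage below are placeholders/assumptions:

.. code:: cpp

    #include <memory>
    #include <string>
    #include <vector>

    #include "mace/public/mace.h"
    #include "mace/public/mace_engine_factory.h"  // generated with the model code

    using namespace mace;  // assumption: the public API lives in the mace namespace

    // Sketch only: keep the tensor data in an external file; for the GPU/DSP
    // runtimes the data is only needed during Init, so the mapping can be
    // released after initialization.
    std::vector<std::string> input_names = {"input"};    // placeholder
    std::vector<std::string> output_names = {"output"};  // placeholder
    std::shared_ptr<MaceEngine> engine;
    MaceStatus status = CreateMaceEngine(
        "mobilenet_v1",                  // model tag (placeholder)
        "/path/to/model.data",           // external tensor data file (placeholder)
        input_names, output_names, DeviceType::GPU, &engine);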
Why is the generated static library file size so huge?
-------------------------------------------------------
The static library is simply an archive of a set of object files which are
......
Docker Images
=============
* Log in to [Xiaomi Docker Registry](http://docs.api.xiaomi.net/docker-registry/)
```
docker login cr.d.xiaomi.net
```
* Build with `Dockerfile`
```
docker build -t cr.d.xiaomi.net/mace/mace-dev .
```
* Pull image from docker registry
```
docker pull cr.d.xiaomi.net/mace/mace-dev
```
* Create container
```
# Set 'host' network to use ADB
docker run -it --rm -v /local/path:/container/path --net=host cr.d.xiaomi.net/mace/mace-dev /bin/bash
```
......@@ -33,11 +33,13 @@ How to build
+=====================+=================+===================================================================================================+
| bazel | >= 0.5.4 | - |
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
| android-ndk | r12c | - |
| android-ndk | r15c,r16b | - |
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
| adb | >= 1.0.32 | apt install -y android-tools-adb |
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
| tensorflow | 1.4.0 | pip install tensorflow==1.4.0 |
| tensorflow | 1.7.0 | pip install tensorflow==1.7.0 |
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
| numpy | >= 1.14.0 | pip install numpy |
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
| scipy | >= 1.0.0 | pip install scipy |
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
......@@ -45,9 +47,43 @@ How to build
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
| PyYaml | >= 3.12 | pip install pyyaml |
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
| sh | >= 1.12.14 | pip install sh |
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
| filelock | >= 3.0.0 | pip install filelock |
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
| docker(for caffe) | >= 17.09.0-ce | `install doc <https://docs.docker.com/install/linux/docker-ce/ubuntu/#set-up-the-repository>`__ |
+---------------------+-----------------+---------------------------------------------------------------------------------------------------+
Docker Images
----------------
* Log in to `Xiaomi Docker Registry <http://docs.api.xiaomi.net/docker-registry/>`__
.. code:: sh
docker login cr.d.xiaomi.net
* Build with Dockerfile
.. code:: sh
docker build -t cr.d.xiaomi.net/mace/mace-dev .
* Pull image from docker registry
.. code:: sh
docker pull cr.d.xiaomi.net/mace/mace-dev
* Create container
.. code:: sh
# Set 'host' network to use ADB
docker run -it --rm -v /local/path:/container/path --net=host cr.d.xiaomi.net/mace/mace-dev /bin/bash
Usage
-----
......@@ -199,29 +235,47 @@ Currently only the latest Caffe version is supported; please use Caffe's tools to upgrade models from older versions
// Include the headers
#include "mace/public/mace.h"
#include "mace/public/{MODEL_TAG}.h"
#include "mace/public/mace_engine_factory.h"
// 0. Configure internal storage
// 0. Configure internal storage (only needs to be set once)
const std::string file_path = "/path/to/store/internal/files";
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(file_path));
ConfigKVStorageFactory(storage_factory);
//1. Load the model data from a file or from code; a custom loader can also be used (e.g. to add your own compression or encryption)
// If the model data is embedded in code, pass nullptr for this argument.
unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
//2. Create the net object
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
//1. Declare the device type (must match the runtime specified at build time)
DeviceType device_type = DeviceType::GPU;
//3. Declare the device type (must match the runtime specified at build time)
DeviceType device_type = DeviceType::OPENCL;
//4. Define the input and output tensor name arrays
//2. Define the input and output tensor name arrays
std::vector<std::string> input_names = {...};
std::vector<std::string> output_names = {...};
//5. Create the input and output objects
//3. Create the MaceEngine object
std::shared_ptr<mace::MaceEngine> engine;
MaceStatus create_engine_status;
// Create Engine
if (model_data_file.empty()) {
create_engine_status =
CreateMaceEngine(model_name.c_str(),
nullptr,
input_names,
output_names,
device_type,
&engine);
} else {
create_engine_status =
CreateMaceEngine(model_name.c_str(),
model_data_file.c_str(),
input_names,
output_names,
device_type,
&engine);
}
if (create_engine_status != MaceStatus::MACE_SUCCESS) {
// do something
}
//4. Create the input and output objects
std::map<std::string, mace::MaceTensor> inputs;
std::map<std::string, mace::MaceTensor> outputs;
for (size_t i = 0; i < input_count; ++i) {
......@@ -246,14 +300,6 @@ Currently only the latest Caffe version is supported; please use Caffe's tools to upgrade models from older versions
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
}
//6. Create the MaceEngine object
mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
//7. If the device type is OPENCL or HEXAGON, model_data can be released here
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}
//8. Run the model and get the results
//5. Run the model and get the results
engine.Run(inputs, &outputs);
Introduction
============
TODO: describe the concepts and workflow with a diagram.
![alt text](workflow.jpg "MiAI workflow")
TODO: describe the runtime.
Introduction
============
MiAI Compute Engine is a deep learning inference framework optimized for
mobile heterogeneous computing platforms. The following figure shows the
overall architecture.
.. image:: mace-arch.png
:scale: 40 %
:align: center
Model format
------------
MiAI Compute Engine defines a customized model format which is similar to
Caffe2. The MiAI model can be converted from models exported by TensorFlow
and Caffe. We define a YAML schema to describe the model deployment. In the
next chapter, there is a detailed guide showing how to create this YAML file.
Model conversion
----------------
Currently, we provide model converters for TensorFlow and Caffe; support for
more frameworks will be added in the future.
Model loading
-------------
The MiAI model format contains two parts: the model graph definition and
the model parameter tensors. The graph part utilizes Protocol Buffers
for serialization. All the model parameter tensors are concatenated
together into a contiguous array, which we call the tensor data in
the following paragraphs. The tensor data offsets and lengths are
recorded in the model graph.
The models can be loaded in three ways:
1. Both the model graph and the tensor data are dynamically loaded externally
(by default from the file system, but users are free to provide their own
implementations, for example with compression or encryption). This approach
provides the most flexibility but the weakest model protection.
2. Both the model graph and the tensor data are converted into C++ code and
loaded by executing the compiled code. This approach provides the strongest
model protection and the simplest deployment.
3. The model graph is converted into C++ code and constructed as in the second
approach, while the tensor data is loaded externally as in the first approach.
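As a rough sketch of how these three options map onto the engine-creation helpers used elsewhere in this change (the model tag, tensor names, file paths, and namespace usage are placeholders/assumptions, and exact signatures may differ slightly):

.. code:: cpp

    #include <memory>
    #include <string>
    #include <vector>

    #include "mace/public/mace.h"
    #include "mace/public/mace_engine_factory.h"  // generated with the model code

    void LoadingApproachesSketch() {
      using namespace mace;  // assumption: the public API lives in the mace namespace
      std::vector<std::string> input_names = {"input"};    // placeholder
      std::vector<std::string> output_names = {"output"};  // placeholder
      std::shared_ptr<MaceEngine> engine;

      // Approach 2: graph and tensor data both compiled into C++ code;
      // passing nullptr selects the embedded tensor data.
      MaceStatus status = CreateMaceEngine(
          "mobilenet_v1", nullptr,
          input_names, output_names, DeviceType::CPU, &engine);
      if (status != MaceStatus::MACE_SUCCESS) { /* handle the error */ }

      // Approach 3: graph compiled into C++ code, tensor data loaded from a
      // file (which can be unmapped after initialization on GPU/DSP).
      status = CreateMaceEngine(
          "mobilenet_v1", "/path/to/mobilenet_v1.data",
          input_names, output_names, DeviceType::CPU, &engine);

      // Approach 1: graph (protobuf) and tensor data both loaded externally;
      // this is what the CreateMaceEngineFromPB helper in this change covers.
      (void)status;
    }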
......@@ -6,20 +6,22 @@ Operator lists
:widths: auto
:header: "Operator","Android NN","Supported","Remark"
"ADD","Y","Y",""
"AVERAGE_POOL_2D","Y","Y",""
"BATCH_NORM","","Y","Fusion with activation is supported"
"BATCH_TO_SPACE_ND","Y","Y",""
"BIAS_ADD","","Y",""
"CHANNEL_SHUFFLE","","Y",""
"CONCATENATION","Y","Y",""
"CONCATENATION","Y","Y","Only support channel axis concatenation"
"CONV_2D","Y","Y","Fusion with BN and activation layer is supported"
"DECONV_2D","N","Y","Only tensorflow model is supported"
"DEPTHWISE_CONV_2D","Y","Y","Only multiplier = 1 is supported; Fusion is supported"
"DEPTH_TO_SPACE","Y","Y",""
"DEQUANTIZE","Y","",""
"DEQUANTIZE","Y","Y","Model quantization will be supported later"
"ELEMENT_WISE","Y","Y","ADD/MUL/DIV/MIN/MAX/NEG/ABS/SQR_DIFF/POW"
"EMBEDDING_LOOKUP","Y","",""
"FLOOR","Y","",""
"FULLY_CONNECTED","Y","Y",""
"GROUP_CONV_2D","","",""
"GROUP_CONV_2D","","","Caffe model with group count = channel count is supported"
"HASHTABLE_LOOKUP","Y","",""
"L2_NORMALIZATION","Y","",""
"L2_POOL_2D","Y","",""
......@@ -29,18 +31,20 @@ Operator lists
"LSTM","Y","",""
"MATMUL","","Y",""
"MAX_POOL_2D","Y","Y",""
"MUL","Y","",""
"PAD", "N","Y",""
"PSROI_ALIGN","","Y",""
"PRELU","","Y",""
"PRELU","","Y","Only caffe model is supported"
"RELU","Y","Y",""
"RELU1","Y","Y",""
"RELU6","Y","Y",""
"RELUX","","Y",""
"RESHAPE","Y","Y","Limited support"
"RESHAPE","Y","Y","Limited support: only internal use of reshape in composed operations is supported"
"RESIZE_BILINEAR","Y","Y",""
"RNN","Y","",""
"RPN_PROPOSAL_LAYER","","Y",""
"SLICE","N","Y","Only support channel axis slice"
"SOFTMAX","Y","Y",""
"SPACE_TO_BATCH_ND","Y", "Y",""
"SPACE_TO_DEPTH","Y","Y",""
"SVDF","Y","",""
"TANH","Y","Y",""
......@@ -11,7 +11,6 @@ The main documentation is organized into the following sections:
getting_started/introduction
getting_started/create_a_model_deployment
getting_started/docker
getting_started/how_to_build
getting_started/op_lists
......
......@@ -15,6 +15,7 @@ load(
"if_production_mode",
"if_not_production_mode",
"if_openmp_enabled",
"if_neon_enabled",
)
cc_library(
......@@ -51,7 +52,11 @@ cc_library(
"-DMACE_ENABLE_OPENMP",
]) + if_android([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]),
linkopts = ["-ldl"] + if_android([
"-pie",
"-lm",
......
......@@ -16,6 +16,7 @@
#define MACE_CORE_ALLOCATOR_H_
#include <stdlib.h>
#include <string.h>
#include <map>
#include <limits>
#include <vector>
......@@ -42,9 +43,10 @@ class Allocator {
public:
Allocator() {}
virtual ~Allocator() noexcept {}
virtual void *New(size_t nbytes) const = 0;
virtual void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) const = 0;
virtual MaceStatus New(size_t nbytes, void **result) const = 0;
virtual MaceStatus NewImage(const std::vector<size_t> &image_shape,
const DataType dt,
void **result) const = 0;
virtual void Delete(void *data) const = 0;
virtual void DeleteImage(void *data) const = 0;
virtual void *Map(void *buffer, size_t offset, size_t nbytes) const = 0;
......@@ -53,44 +55,54 @@ class Allocator {
std::vector<size_t> *mapped_image_pitch) const = 0;
virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
virtual bool OnHost() const = 0;
template <typename T>
T *New(size_t num_elements) {
if (num_elements > (std::numeric_limits<size_t>::max() / sizeof(T))) {
return nullptr;
}
void *p = New(sizeof(T) * num_elements);
T *typed_p = reinterpret_cast<T *>(p);
return typed_p;
}
};
class CPUAllocator : public Allocator {
public:
~CPUAllocator() override {}
void *New(size_t nbytes) const override {
MaceStatus New(size_t nbytes, void **result) const override {
VLOG(3) << "Allocate CPU buffer: " << nbytes;
if (nbytes == 0) {
return MaceStatus::MACE_SUCCESS;
}
void *data = nullptr;
#if defined(__ANDROID__) || defined(__hexagon__)
data = memalign(kMaceAlignment, nbytes);
if (data == NULL) {
LOG(WARNING) << "Allocate CPU Buffer with "
<< nbytes << " bytes failed because of"
<< strerror(errno);
*result = nullptr;
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
#else
MACE_CHECK(posix_memalign(&data, kMaceAlignment, nbytes) == 0);
int ret = posix_memalign(&data, kMaceAlignment, nbytes);
if (ret != 0) {
LOG(WARNING) << "Allocate CPU Buffer with "
<< nbytes << " bytes failed because of"
<< strerror(errno);
*result = nullptr;
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
#endif
MACE_CHECK_NOTNULL(data);
// TODO(heliangliang) This should be avoided sometimes
memset(data, 0, nbytes);
return data;
*result = data;
return MaceStatus::MACE_SUCCESS;
}
void *NewImage(const std::vector<size_t> &shape,
const DataType dt) const override {
MaceStatus NewImage(const std::vector<size_t> &shape,
const DataType dt,
void **result) const override {
MACE_UNUSED(shape);
MACE_UNUSED(dt);
MACE_UNUSED(result);
LOG(FATAL) << "Allocate CPU image";
return nullptr;
return MaceStatus::MACE_SUCCESS;
}
void Delete(void *data) const override {
MACE_CHECK_NOTNULL(data);
VLOG(3) << "Free CPU buffer";
free(data);
}
......
......@@ -23,7 +23,8 @@ namespace mace {
ArgumentHelper::ArgumentHelper(const OperatorDef &def) {
for (auto &arg : def.arg()) {
if (arg_map_.find(arg.name()) != arg_map_.end()) {
LOG(WARNING) << "Duplicated argument name found in operator def.";
LOG(WARNING) << "Duplicated argument name found in operator def: "
<< def.name() << " " << arg.name();
}
arg_map_[arg.name()] = arg;
......
......@@ -38,6 +38,11 @@ class BufferBase {
virtual void *raw_mutable_data() = 0;
virtual MaceStatus Allocate(index_t nbytes) = 0;
virtual MaceStatus Allocate(const std::vector<size_t> &shape,
DataType data_type) = 0;
virtual void *Map(index_t offset,
index_t length,
std::vector<size_t> *pitch) const = 0;
......@@ -48,7 +53,7 @@ class BufferBase {
virtual void UnMap() = 0;
virtual void Resize(index_t size) = 0;
virtual MaceStatus Resize(index_t nbytes) = 0;
virtual void Copy(void *src, index_t offset, index_t length) = 0;
......@@ -56,6 +61,8 @@ class BufferBase {
virtual void Clear() = 0;
virtual void Clear(index_t size) = 0;
virtual index_t offset() const { return 0; }
template <typename T>
......@@ -83,14 +90,6 @@ class Buffer : public BufferBase {
mapped_buf_(nullptr),
is_data_owner_(true) {}
Buffer(Allocator *allocator, index_t size)
: BufferBase(size),
allocator_(allocator),
mapped_buf_(nullptr),
is_data_owner_(true) {
buf_ = allocator->New(size);
}
Buffer(Allocator *allocator, void *data, index_t size)
: BufferBase(size),
allocator_(allocator),
......@@ -132,6 +131,31 @@ class Buffer : public BufferBase {
}
}
MaceStatus Allocate(index_t nbytes) {
if (nbytes <= 0) {
return MaceStatus::MACE_SUCCESS;
}
MACE_CHECK(is_data_owner_,
"data is not owned by this buffer, cannot reallocate");
if (mapped_buf_ != nullptr) {
UnMap();
}
if (buf_ != nullptr) {
allocator_->Delete(buf_);
}
size_ = nbytes;
return allocator_->New(nbytes, &buf_);
}
MaceStatus Allocate(const std::vector<size_t> &shape,
DataType data_type) {
if (shape.empty()) return MaceStatus::MACE_SUCCESS;
index_t nbytes = std::accumulate(shape.begin(), shape.end(),
1, std::multiplies<size_t>())
* GetEnumTypeSize(data_type);
return this->Allocate(nbytes);
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_CHECK_NOTNULL(buf_);
MACE_UNUSED(pitch);
......@@ -154,16 +178,17 @@ class Buffer : public BufferBase {
mapped_buf_ = nullptr;
}
void Resize(index_t size) {
MaceStatus Resize(index_t nbytes) {
MACE_CHECK(is_data_owner_,
"data is not owned by this buffer, cannot resize");
if (size != size_) {
if (nbytes != size_) {
if (buf_ != nullptr) {
allocator_->Delete(buf_);
}
size_ = size;
buf_ = allocator_->New(size);
size_ = nbytes;
return allocator_->New(nbytes, &buf_);
}
return MaceStatus::MACE_SUCCESS;
}
void Copy(void *src, index_t offset, index_t length) {
......@@ -175,7 +200,11 @@ class Buffer : public BufferBase {
bool OnHost() const { return allocator_->OnHost(); }
void Clear() {
memset(reinterpret_cast<char*>(raw_mutable_data()), 0, size_);
Clear(size_);
}
void Clear(index_t size) {
memset(reinterpret_cast<char*>(raw_mutable_data()), 0, size);
}
protected:
......@@ -195,18 +224,6 @@ class Image : public BufferBase {
buf_(nullptr),
mapped_buf_(nullptr) {}
Image(std::vector<size_t> shape, DataType data_type)
: BufferBase(
std::accumulate(
shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(GPU)),
mapped_buf_(nullptr) {
shape_ = shape;
data_type_ = data_type;
buf_ = allocator_->NewImage(shape, data_type);
}
virtual ~Image() {
if (mapped_buf_ != nullptr) {
UnMap();
......@@ -233,6 +250,29 @@ class Image : public BufferBase {
std::vector<size_t> image_shape() const { return shape_; }
MaceStatus Allocate(index_t nbytes) {
MACE_UNUSED(nbytes);
LOG(FATAL) << "Image should not call this allocate function";
return MaceStatus::MACE_SUCCESS;
}
MaceStatus Allocate(const std::vector<size_t> &shape,
DataType data_type) {
index_t size = std::accumulate(
shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
GetEnumTypeSize(data_type);
if (mapped_buf_ != nullptr) {
UnMap();
}
if (buf_ != nullptr) {
allocator_->DeleteImage(buf_);
}
size_ = size;
shape_ = shape;
data_type_ = data_type;
return allocator_->NewImage(shape, data_type, &buf_);
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_UNUSED(offset);
MACE_UNUSED(length);
......@@ -259,9 +299,10 @@ class Image : public BufferBase {
mapped_buf_ = nullptr;
}
void Resize(index_t size) {
MaceStatus Resize(index_t size) {
MACE_UNUSED(size);
MACE_NOT_IMPLEMENTED;
return MaceStatus::MACE_SUCCESS;
}
void Copy(void *src, index_t offset, index_t length) {
......@@ -277,6 +318,11 @@ class Image : public BufferBase {
MACE_NOT_IMPLEMENTED;
}
void Clear(index_t size) {
MACE_UNUSED(size);
MACE_NOT_IMPLEMENTED;
}
private:
Allocator *allocator_;
std::vector<size_t> shape_;
......@@ -339,6 +385,20 @@ class BufferSlice : public BufferBase {
}
}
MaceStatus Allocate(index_t size) {
MACE_UNUSED(size);
LOG(FATAL) << "BufferSlice should not call allocate function";
return MaceStatus::MACE_SUCCESS;
}
MaceStatus Allocate(const std::vector<size_t> &shape,
DataType data_type) {
MACE_UNUSED(shape);
MACE_UNUSED(data_type);
LOG(FATAL) << "BufferSlice should not call allocate function";
return MaceStatus::MACE_SUCCESS;
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_UNUSED(offset);
MACE_UNUSED(length);
......@@ -364,9 +424,10 @@ class BufferSlice : public BufferBase {
mapped_buf_ = nullptr;
}
void Resize(index_t size) {
MaceStatus Resize(index_t size) {
MACE_CHECK(size == size_, "resize buffer slice from ", size_,
" to ", size, " is illegal");
return MaceStatus::MACE_SUCCESS;
}
void Copy(void *src, index_t offset, index_t length) {
......@@ -381,7 +442,11 @@ class BufferSlice : public BufferBase {
bool OnHost() const { return buffer_->OnHost(); }
void Clear() {
memset(raw_mutable_data(), 0, size_);
Clear(size_);
}
void Clear(index_t size) {
memset(raw_mutable_data(), 0, size);
}
private:
......@@ -396,20 +461,17 @@ class ScratchBuffer: public Buffer {
: Buffer(allocator),
offset_(0) {}
ScratchBuffer(Allocator *allocator, index_t size)
: Buffer(allocator, size),
offset_(0) {}
ScratchBuffer(Allocator *allocator, void *data, index_t size)
: Buffer(allocator, data, size),
offset_(0) {}
virtual ~ScratchBuffer() {}
void GrowSize(index_t size) {
MaceStatus GrowSize(index_t size) {
if (size > size_) {
Resize(size);
return Resize(size);
}
return MaceStatus::MACE_SUCCESS;
}
BufferSlice Scratch(index_t size) {
......
......@@ -90,13 +90,15 @@ std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
// Mace Engine
class MaceEngine::Impl {
public:
explicit Impl(const NetDef *net_def,
DeviceType device_type,
const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes,
const unsigned char *model_data);
explicit Impl(DeviceType device_type);
~Impl();
MaceStatus Init(const NetDef *net_def,
const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes,
const unsigned char *model_data);
MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
std::map<std::string, MaceTensor> *outputs,
RunMetadata *run_metadata);
......@@ -113,11 +115,7 @@ class MaceEngine::Impl {
DISABLE_COPY_AND_ASSIGN(Impl);
};
MaceEngine::Impl::Impl(const NetDef *net_def,
DeviceType device_type,
const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes,
const unsigned char *model_data)
MaceEngine::Impl::Impl(DeviceType device_type)
: op_registry_(new OperatorRegistry()),
device_type_(device_type),
ws_(new Workspace()),
......@@ -125,7 +123,13 @@ MaceEngine::Impl::Impl(const NetDef *net_def,
#ifdef MACE_ENABLE_HEXAGON
, hexagon_controller_(nullptr)
#endif
{
{}
MaceStatus MaceEngine::Impl::Init(
const NetDef *net_def,
const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes,
const unsigned char *model_data) {
LOG(INFO) << "MACE version: " << MaceVersion();
// Set storage path for internal usage
for (auto input_name : input_nodes) {
......@@ -137,7 +141,7 @@ MaceEngine::Impl::Impl(const NetDef *net_def,
GetDeviceAllocator(device_type_), DT_FLOAT);
}
#ifdef MACE_ENABLE_HEXAGON
if (device_type == HEXAGON) {
if (device_type_ == HEXAGON) {
hexagon_controller_.reset(new HexagonControlWrapper());
MACE_CHECK(hexagon_controller_->Config(), "hexagon config error");
MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
......@@ -153,18 +157,23 @@ MaceEngine::Impl::Impl(const NetDef *net_def,
}
} else {
#endif
ws_->LoadModelTensor(*net_def, device_type, model_data);
MaceStatus status =
ws_->LoadModelTensor(*net_def, device_type_, model_data);
if (status != MaceStatus::MACE_SUCCESS) {
return status;
}
// Init model
auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type,
// Init model
auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_,
NetMode::INIT);
if (!net->Run()) {
LOG(FATAL) << "Net init run failed";
}
net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type);
net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_);
#ifdef MACE_ENABLE_HEXAGON
}
#endif
return MaceStatus::MACE_SUCCESS;
}
MaceEngine::Impl::~Impl() {
......@@ -254,18 +263,18 @@ MaceStatus MaceEngine::Impl::Run(
return MACE_SUCCESS;
}
MaceEngine::MaceEngine(const NetDef *net_def,
DeviceType device_type,
const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes,
const unsigned char *model_data) {
impl_ = std::unique_ptr<MaceEngine::Impl>(
new MaceEngine::Impl(net_def, device_type, input_nodes, output_nodes,
model_data));
}
MaceEngine::MaceEngine(DeviceType device_type):
impl_(new MaceEngine::Impl(device_type)) {}
MaceEngine::~MaceEngine() = default;
MaceStatus MaceEngine::Init(const NetDef *net_def,
const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes,
const unsigned char *model_data) {
return impl_->Init(net_def, input_nodes, output_nodes, model_data);
}
MaceStatus MaceEngine::Run(const std::map<std::string, MaceTensor> &inputs,
std::map<std::string, MaceTensor> *outputs,
RunMetadata *run_metadata) {
......@@ -325,17 +334,18 @@ MaceStatus CreateMaceEngineFromPB(const std::string &model_data_file,
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type())));
}
MaceStatus status;
const unsigned char *model_data = nullptr;
model_data = LoadModelData(model_data_file, model_data_size);
engine->reset(
new mace::MaceEngine(&net_def, device_type, input_nodes, output_nodes,
model_data));
engine->reset(new mace::MaceEngine(device_type));
status = (*engine)->Init(&net_def, input_nodes, output_nodes, model_data);
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
UnloadModelData(model_data, model_data_size);
}
return MACE_SUCCESS;
return status;
}
} // namespace mace
......@@ -18,9 +18,11 @@
#include <omp.h>
#endif
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <string.h>
#include <algorithm>
#include <utility>
#include <vector>
......@@ -44,7 +46,7 @@ int GetCPUCount() {
result = access(path, F_OK);
if (result != 0) {
if (errno != ENOENT) {
LOG(ERROR) << "Access " << path << " failed, errno: " << errno;
LOG(ERROR) << "Access " << path << " failed: " << strerror(errno);
}
return cpu_count;
}
......@@ -81,7 +83,7 @@ void SetThreadAffinity(cpu_set_t mask) {
pid_t pid = syscall(SYS_gettid);
#endif
int err = sched_setaffinity(pid, sizeof(mask), &mask);
MACE_CHECK(err == 0, "set affinity error: ", errno);
MACE_CHECK(err == 0, "set affinity error: ", strerror(errno));
}
} // namespace
......@@ -101,7 +103,7 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
for (int i = 0; i < cpu_count; ++i) {
cpu_max_freq[i] = GetCPUMaxFreq(i);
if (cpu_max_freq[i] == 0) {
LOG(WARNING) << "Cannot get cpu" << i
LOG(WARNING) << "Cannot get CPU" << i
<< "'s max frequency info, maybe it is offline.";
return MACE_INVALID_ARGS;
}
......@@ -128,13 +130,12 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
const std::vector<int> &cpu_ids) {
#ifdef MACE_ENABLE_OPENMP
VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
<< ", CPU core IDs: " << MakeString(cpu_ids);
#ifdef MACE_ENABLE_OPENMP
omp_set_num_threads(omp_num_threads);
#else
LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed.";
LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
#endif
// compute mask
......@@ -147,11 +148,13 @@ void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
#ifdef MACE_ENABLE_OPENMP
#pragma omp parallel for
for (int i = 0; i < omp_num_threads; ++i) {
VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num()
<< "/" << omp_get_num_threads();
SetThreadAffinity(mask);
}
#else
SetThreadAffinity(mask);
LOG(INFO) << "SetThreadAffinity: " << mask.__bits[0];
VLOG(1) << "Set affinity without OpenMP: " << mask.__bits[0];
#endif
}
......@@ -163,7 +166,7 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
}
#else
LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed.";
LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
#endif
return MACE_SUCCESS;
}
......@@ -192,7 +195,7 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy) {
VLOG(1) << "Set CPU openmp num_threads_hint: " << num_threads_hint
VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint
<< ", affinity policy: " << policy;
return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, policy);
}
......
......@@ -44,18 +44,30 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {}
void *OpenCLAllocator::New(size_t nbytes) const {
MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
if (nbytes == 0) {
return MaceStatus::MACE_SUCCESS;
}
VLOG(3) << "Allocate OpenCL buffer: " << nbytes;
cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
nbytes, nullptr, &error);
MACE_CHECK_CL_SUCCESS(error);
return static_cast<void *>(buffer);
if (error != CL_SUCCESS) {
LOG(WARNING) << "Allocate OpenCL Buffer with "
<< nbytes << " bytes failed because of"
<< OpenCLErrorToString(error);
*result = nullptr;
return MaceStatus::MACE_OUT_OF_RESOURCES;
} else {
*result = buffer;
return MaceStatus::MACE_SUCCESS;
}
}
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) const {
MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt,
void **result) const {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
<< image_shape[1];
......@@ -67,11 +79,17 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
new cl::Image2D(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
image_shape[0], image_shape[1], 0, nullptr, &error);
MACE_CHECK_CL_SUCCESS(error) << " with image shape: ["
<< image_shape[0] << ", " << image_shape[1]
<< "]";
return cl_image;
if (error != CL_SUCCESS) {
LOG(WARNING) << "Allocate OpenCL image with shape: ["
<< image_shape[0] << ", " << image_shape[1]
<< "] failed because of"
<< OpenCLErrorToString(error);
*result = nullptr;
return MaceStatus::MACE_OUT_OF_RESOURCES;
} else {
*result = cl_image;
return MaceStatus::MACE_SUCCESS;
}
}
void OpenCLAllocator::Delete(void *buffer) const {
......
......@@ -27,15 +27,16 @@ class OpenCLAllocator : public Allocator {
~OpenCLAllocator() override;
void *New(size_t nbytes) const override;
MaceStatus New(size_t nbytes, void **result) const override;
/*
* Use Image2D with RGBA (128-bit) format to represent the image.
*
* @ shape : [depth, ..., height, width ].
*/
void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) const override;
MaceStatus NewImage(const std::vector<size_t> &image_shape,
const DataType dt,
void **result) const override;
void Delete(void *buffer) const override;
......
......@@ -208,7 +208,7 @@ class Tensor {
inline void Clear() {
MACE_CHECK_NOTNULL(buffer_);
buffer_->Clear();
buffer_->Clear(raw_size());
}
inline void Reshape(const std::vector<index_t> &shape) {
......@@ -216,16 +216,21 @@ class Tensor {
MACE_CHECK(raw_size() <= buffer_->size());
}
inline void Resize(const std::vector<index_t> &shape) {
inline MaceStatus Resize(const std::vector<index_t> &shape) {
shape_ = shape;
image_shape_.clear();
if (buffer_ != nullptr) {
MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size())
buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE);
if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to "
<< raw_size() + EXTRA_BUFFER_PAD_SIZE;
return buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE);
}
return MaceStatus::MACE_SUCCESS;
} else {
MACE_CHECK(is_buffer_owner_);
buffer_ = new Buffer(allocator_, raw_size() + EXTRA_BUFFER_PAD_SIZE);
buffer_ = new Buffer(allocator_);
return buffer_->Allocate(raw_size() + EXTRA_BUFFER_PAD_SIZE);
}
}
......@@ -241,13 +246,14 @@ class Tensor {
is_buffer_owner_ = false;
}
inline void ResizeImage(const std::vector<index_t> &shape,
const std::vector<size_t> &image_shape) {
inline MaceStatus ResizeImage(const std::vector<index_t> &shape,
const std::vector<size_t> &image_shape) {
shape_ = shape;
image_shape_ = image_shape;
if (buffer_ == nullptr) {
MACE_CHECK(is_buffer_owner_);
buffer_ = new Image(image_shape, dtype_);
buffer_ = new Image();
return buffer_->Allocate(image_shape, dtype_);
} else {
MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
Image *image = dynamic_cast<Image *>(buffer_);
......@@ -257,24 +263,27 @@ class Tensor {
"): current physical image shape: ", image->image_shape()[0],
", ", image->image_shape()[1], " < logical image shape: ",
image_shape[0], ", ", image_shape[1]);
return MaceStatus::MACE_SUCCESS;
}
}
inline void ResizeLike(const Tensor &other) { ResizeLike(&other); }
inline MaceStatus ResizeLike(const Tensor &other) {
return ResizeLike(&other);
}
inline void ResizeLike(const Tensor *other) {
inline MaceStatus ResizeLike(const Tensor *other) {
if (other->has_opencl_image()) {
if (is_buffer_owner_ && buffer_ != nullptr && !has_opencl_image()) {
delete buffer_;
buffer_ = nullptr;
}
ResizeImage(other->shape(), other->image_shape_);
return ResizeImage(other->shape(), other->image_shape_);
} else {
if (is_buffer_owner_ && buffer_ != nullptr && has_opencl_image()) {
delete buffer_;
buffer_ = nullptr;
}
Resize(other->shape());
return Resize(other->shape());
}
}
......
......@@ -60,9 +60,9 @@ std::vector<std::string> Workspace::Tensors() const {
return names;
}
void Workspace::LoadModelTensor(const NetDef &net_def,
DeviceType type,
const unsigned char *model_data) {
MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
DeviceType type,
const unsigned char *model_data) {
MACE_LATENCY_LOGGER(1, "Load model tensors");
index_t model_data_size = 0;
for (auto &const_tensor : net_def.tensors()) {
......@@ -82,7 +82,11 @@ void Workspace::LoadModelTensor(const NetDef &net_def,
model_data_size));
} else {
tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_size));
new Buffer(GetDeviceAllocator(type)));
MaceStatus status = tensor_buffer_->Allocate(model_data_size);
if (status != MaceStatus::MACE_SUCCESS) {
return status;
}
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
0, model_data_size);
......@@ -112,14 +116,16 @@ void Workspace::LoadModelTensor(const NetDef &net_def,
}
if (type == DeviceType::CPU || type == DeviceType::GPU) {
CreateOutputTensorBuffer(net_def, type);
MaceStatus status = CreateOutputTensorBuffer(net_def, type);
if (status != MaceStatus::MACE_SUCCESS) return status;
}
return MaceStatus::MACE_SUCCESS;
}
void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
DeviceType device_type) {
MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
DeviceType device_type) {
if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) {
return;
return MaceStatus::MACE_SUCCESS;
}
DataType dtype = DataType::DT_INVALID;
......@@ -149,14 +155,24 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
// TODO(liuqi): refactor based on PB
if (mem_block.mem_id() >= 20000) {
std::unique_ptr<BufferBase> image_buf(
new Image({mem_block.x(), mem_block.y()}, dtype));
new Image());
MaceStatus status = image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype);
if (status != MaceStatus::MACE_SUCCESS) {
return status;
}
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
}
} else {
if (mem_block.mem_id() < 20000) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(device_type), mem_block.x()));
new Buffer(GetDeviceAllocator(device_type)));
MaceStatus status = tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype) + EXTRA_BUFFER_PAD_SIZE);
if (status != MaceStatus::MACE_SUCCESS) {
return status;
}
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
......@@ -193,6 +209,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
}
}
}
return MaceStatus::MACE_SUCCESS;
}
ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
......
......@@ -47,14 +47,15 @@ class Workspace {
std::vector<std::string> Tensors() const;
void LoadModelTensor(const NetDef &net_def,
DeviceType type,
const unsigned char *model_data);
MaceStatus LoadModelTensor(const NetDef &net_def,
DeviceType type,
const unsigned char *model_data);
ScratchBuffer *GetScratchBuffer(DeviceType device_type);
private:
void CreateOutputTensorBuffer(const NetDef &net_def, DeviceType device_type);
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
DeviceType device_type);
TensorMap tensor_map_;
......
......@@ -31,68 +31,38 @@ extern void Conv2dNeonK1x1S1(const float *input,
extern void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
extern void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
extern void Conv2dNeonK5x5S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
extern void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
extern void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
extern void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
} // namespace kernels
......
......@@ -24,22 +24,22 @@ namespace kernels {
// Ho = 2, Wo = 4, Co = 2
void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 2) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 2) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 1 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
......@@ -522,23 +522,22 @@ void Conv2dNeonK3x3S1(const float *input,
void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t c = 0; c < in_shape[1]; ++c) {
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const float *in_base = input + b * in_batch_size + c * in_image_size;
const float
*filter_ptr = filter + m * in_channels * 9 + c * 9;
......
......@@ -103,22 +103,22 @@ inline void Conv2dCPUK5x5Calc(const float *in_ptr_base,
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK5x5S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 4) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
......
......@@ -180,22 +180,22 @@ inline void Conv2dCPUK7x7Calc(const float *in_ptr_base,
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 4) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
......@@ -336,22 +336,22 @@ void Conv2dNeonK7x7S1(const float *input,
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 4) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
......@@ -502,22 +502,22 @@ void Conv2dNeonK7x7S2(const float *input,
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 4) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
......
......@@ -22,15 +22,9 @@ namespace kernels {
void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t *in_shape,
const index_t *out_shape,
const int *pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
......@@ -39,15 +33,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t *in_shape,
const index_t *out_shape,
const int *pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
......
......@@ -52,15 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
// Ho = 2, Wo = 4, Co = 1
void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t* in_shape,
const index_t* out_shape,
const int* pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
......@@ -70,25 +64,30 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
MACE_UNUSED(valid_w_start);
MACE_UNUSED(valid_w_stop);
#endif
const index_t multiplier = out_channels / in_channels;
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t multiplier = out_shape[1] / in_shape[1];
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t b = 0; b < in_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
index_t c = m / multiplier;
index_t multi_index = m % multiplier;
const float *in_base = input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9;
float *out_base = output + b * out_batch_size + m * out_image_size;
index_t h, w;
const index_t pad_top = pad_hw[0];
const index_t pad_left = pad_hw[1];
const index_t out_width = out_shape[3];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
// top
for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) {
for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
......@@ -256,7 +255,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
} // h
#else
for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) {
for (index_t iw = 0; iw < out_shape[3]; ++iw) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
ih,
......@@ -274,8 +273,8 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
#endif
// bottom
for (; h < out_height; ++h) {
for (w = 0; w < out_width; ++w) {
for (; h < out_shape[2]; ++h) {
for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
......@@ -296,15 +295,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t* in_shape,
const index_t* out_shape,
const int* pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
......@@ -314,22 +307,26 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
MACE_UNUSED(valid_w_start);
MACE_UNUSED(valid_w_stop);
#endif
const index_t multiplier = out_channels / in_channels;
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t multiplier = out_shape[1] / in_shape[1];
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t b = 0; b < in_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
index_t c = m / multiplier;
index_t multi_index = m % multiplier;
const float *in_base = input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9;
float *out_base = output + b * out_batch_size + m * out_image_size;
index_t h, w;
const index_t pad_top = pad_hw[0];
const index_t pad_left = pad_hw[1];
const index_t out_width = out_shape[3];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
// top
for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) {
......@@ -472,8 +469,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
#endif
// bottom
for (; h < out_height; ++h) {
for (w = 0; w < out_width; ++w) {
for (; h < out_shape[2]; ++h) {
for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
......
......@@ -84,49 +84,45 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
void Conv2dGeneral(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const index_t *in_shape,
const index_t *out_shape,
const index_t *filter_shape,
const int *stride_hw,
const int *dilation_hw,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t filter_size = filter_height * filter_width;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = filter_shape[1] * in_image_size;
const index_t out_batch_size = filter_shape[0] * out_image_size;
const index_t filter_size = filter_shape[2] * filter_shape[3];
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 4) {
for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < filter_shape[0]; m += 4) {
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t out_channels = filter_shape[0];
const index_t in_channels = filter_shape[1];
const int stride_h = stride_hw[0];
const int stride_w = stride_hw[1];
const int dilation_h = dilation_hw[0];
const int dilation_w = dilation_hw[1];
if (m + 3 < out_channels) {
float *out_ptr0_base =
output + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
float *out_ptr1_base = out_ptr0_base + out_image_size;
float *out_ptr2_base = out_ptr1_base + out_image_size;
float *out_ptr3_base = out_ptr2_base + out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 =
filter + m * in_channels * filter_size + c * filter_size;
const float *filter_ptr1 =
filter + (m + 1) * in_channels * filter_size + c * filter_size;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * filter_size + c * filter_size;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * filter_size + c * filter_size;
const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size;
const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size;
const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -144,8 +140,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
vo3[ow] = out_ptr3_base[out_offset + ow];
}
// calc by row
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
// outch 0
vo0[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr0[kw];
......@@ -185,10 +181,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
} // kw
in_offset += dilation_h * in_width;
filter_ptr0 += filter_width;
filter_ptr1 += filter_width;
filter_ptr2 += filter_width;
filter_ptr3 += filter_width;
filter_ptr0 += filter_shape[3];
filter_ptr1 += filter_shape[3];
filter_ptr2 += filter_shape[3];
filter_ptr3 += filter_shape[3];
} // kh
for (index_t ow = 0; ow < 4; ++ow) {
......@@ -230,8 +226,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
}
// calc by row
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
// outch 0
vo0[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr0[kw];
......@@ -244,7 +240,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
} // kw
in_offset += dilation_h * in_width;
filter_ptr0 += filter_width;
filter_ptr0 += filter_shape[3];
} // kh
for (index_t ow = 0; ow < 4; ++ow) {
......@@ -301,7 +297,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
output_shape.data());
}
output->Resize(output_shape);
output->Clear();
index_t batch = output->dim(0);
index_t channels = output->dim(1);
......@@ -419,7 +414,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else {
} else if (!use_neon_1x1_s1) {
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, (extra_output_height - 1) * stride_h
......@@ -478,6 +473,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
const index_t extra_input_shape[4] =
{batch, input_channels, extra_input_height, extra_input_width};
const index_t extra_output_shape[4] =
{batch, channels, extra_output_height, extra_output_width};
// decide which convolution function to call
if (use_winograd) {
......@@ -512,6 +511,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
float *transformed_input_data = transformed_input.mutable_data<float>();
float *transformed_output_data = transformed_output.mutable_data<float>();
conv_func = [=](const float *pad_input, float *pad_output) {
WinoGradConv3x3s1(pad_input,
transformed_filter_ptr,
......@@ -529,26 +529,16 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else if (use_neon_3x3_s2) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S2(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else if (use_neon_1x1_s1) {
......@@ -566,71 +556,43 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK5x5S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else if (use_neon_7x7_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else if (use_neon_7x7_s2) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S2(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else if (use_neon_7x7_s3) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S3(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dGeneral(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
extra_input_shape,
extra_output_shape,
filter_shape.data(),
strides_,
dilations_,
pad_output);
};
}
......@@ -639,7 +601,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const Tensor *pad_input_ptr = input;
if (extra_input_height != input_height
|| extra_input_width != input_width) {
padded_input.Clear();
ConstructNCHWInputWithSpecificPadding(input,
pad_top,
pad_bottom,
......@@ -649,13 +610,17 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
pad_input_ptr = &padded_input;
}
// TODO(libin): clearing won't be needed once bias is integrated into each conv
Tensor *pad_output_ptr = output;
if (extra_output_height != height || extra_output_width != width) {
padded_output.Reshape({batch, channels, extra_output_height,
extra_output_width});
padded_output.Clear();
pad_output_ptr = &padded_output;
} else if (!use_neon_1x1_s1) {
output->Clear();
}
const float *pad_input_data = pad_input_ptr->data<float>();
float *pad_output_data = pad_output_ptr->mutable_data<float>();
......
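Editorial note: the hunks above replace long scalar parameter lists (batch, in_height, in_width, ...) with pointers to NCHW shape arrays, so all index math is written against in_shape/out_shape/filter_shape. A minimal, hypothetical sketch of that indexing convention follows (not MACE code; NCHWOffset is an illustrative helper):
```
#include <cstdint>

typedef int64_t index_t;

// Flat offset of element (b, c, h, w) in a contiguous NCHW tensor,
// where shape[0]=batch, shape[1]=channels, shape[2]=height, shape[3]=width.
inline index_t NCHWOffset(const index_t *shape,
                          index_t b, index_t c, index_t h, index_t w) {
  return ((b * shape[1] + c) * shape[2] + h) * shape[3] + w;
}

int main() {
  const index_t in_shape[4] = {1, 3, 8, 8};              // N, C, H, W
  const index_t image_size = in_shape[2] * in_shape[3];  // 64
  // (b=0, c=2, h=4, w=5) -> 2 * 64 + 4 * 8 + 5 = 165
  return NCHWOffset(in_shape, 0, 2, 4, 5) ==
         2 * image_size + 4 * in_shape[3] + 5 ? 0 : 1;
}
```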
......@@ -377,6 +377,7 @@ void ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
std::vector<index_t> output_shape(
{batch, channels, height + pad_height, width + pad_width});
output_tensor->Resize(output_shape);
output_tensor->Clear();
Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>();
......
......@@ -41,48 +41,40 @@ template<typename T>
void Deconv2dNCHW(const T *input,
const T *filter,
const T *bias,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t filter_height,
const index_t filter_width,
const index_t stride_h,
const index_t stride_w,
const int padding_top,
const int padding_left,
const index_t *in_shape,
const index_t *out_shape,
const index_t *kernel_hw,
const int *strides,
const int *padding,
float *output) {
#pragma omp parallel for collapse(4)
for (index_t b = 0; b < batch; ++b) {
for (index_t oc = 0; oc < out_channels; ++oc) {
for (index_t oh = 0; oh < out_height; ++oh) {
for (index_t ow = 0; ow < out_width; ++ow) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < out_shape[1]; ++oc) {
for (index_t oh = 0; oh < out_shape[2]; ++oh) {
for (index_t ow = 0; ow < out_shape[3]; ++ow) {
index_t filter_start_y, filter_start_x;
index_t start_x = std::max<int>(0, ow + stride_w -1 - padding_left);
index_t start_y = std::max<int>(0, oh + stride_h -1 - padding_top);
start_x /= stride_w;
start_y /= stride_h;
filter_start_x = padding_left + stride_w * start_x - ow;
filter_start_y = padding_top + stride_h * start_y - oh;
filter_start_x = filter_width - 1 - filter_start_x;
filter_start_y = filter_height - 1 - filter_start_y;
index_t start_x = std::max<int>(0, ow + strides[1] - 1 - padding[1]);
index_t start_y = std::max<int>(0, oh + strides[0] - 1 - padding[0]);
start_x /= strides[1];
start_y /= strides[0];
filter_start_x = padding[1] + strides[1] * start_x - ow;
filter_start_y = padding[0] + strides[0] * start_y - oh;
filter_start_x = kernel_hw[1] - 1 - filter_start_x;
filter_start_y = kernel_hw[0] - 1 - filter_start_y;
T out_value = 0;
index_t out_pos =
((b * out_channels + oc) * out_height + oh) * out_width + ow;
for (index_t ic = 0; ic < in_channels; ++ic) {
((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow;
for (index_t ic = 0; ic < in_shape[1]; ++ic) {
for (index_t f_y = filter_start_y, ih = start_y;
f_y >= 0 && ih < in_height; f_y -= stride_h, ++ih) {
f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) {
for (index_t f_x = filter_start_x, iw = start_x;
f_x >= 0 && iw < in_width; f_x -= stride_w, ++iw) {
f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) {
index_t weight_pos =
((oc * in_channels + ic) * filter_height + f_y)
* filter_width + f_x;
((oc * in_shape[1] + ic) * kernel_hw[0] + f_y)
* kernel_hw[1] + f_x;
index_t in_pos =
((b * in_channels + ic) * in_height + ih)
* in_width + iw;
((b * in_shape[1] + ic) * in_shape[2] + ih)
* in_shape[3] + iw;
out_value += input[in_pos] * filter[weight_pos];
}
}
......@@ -269,26 +261,17 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
paddings_.data(), true);
output->Resize(output_shape_);
}
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
index_t width = output->dim(3);
index_t input_batch = input->dim(0);
index_t input_channels = input->dim(1);
index_t input_height = input->dim(2);
index_t input_width = input->dim(3);
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
index_t stride_h = strides_[0];
index_t stride_w = strides_[1];
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
const index_t *in_shape = input->shape().data();
const index_t *out_shape = output->shape().data();
const index_t kernel_hw[2] = {kernel_h, kernel_w};
MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ",
output_shape[1]);
MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ",
in_shape[1]);
MACE_CHECK(in_shape[0] == out_shape[0], "Input/Output batch size mismatch");
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
......@@ -297,17 +280,23 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
auto filter_data = filter->data<T>();
auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
auto output_data = output->mutable_data<T>();
int padding_top = (paddings_[0] + 1) >> 1;
int padding_left = (paddings_[1] + 1) >> 1;
deconv::Deconv2dNCHW(input_data, filter_data, bias_data,
batch, input_height, input_width, input_channels,
height, width, channels,
kernel_h, kernel_w,
stride_h, stride_w, padding_top, padding_left,
int padding[2];
padding[0] = (paddings_[0] + 1) >> 1;
padding[1] = (paddings_[1] + 1) >> 1;
deconv::Deconv2dNCHW(input_data,
filter_data,
bias_data,
in_shape,
out_shape,
kernel_hw,
strides_,
padding,
output_data);
DoActivation(output_data, output_data, output->size(), activation_,
DoActivation(output_data,
output_data,
output->size(),
activation_,
relux_max_limit_);
}
};
......
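Editorial note: Deconv2dNCHW above gathers contributions per output pixel instead of scattering per input pixel. Assuming the scatter view in which an input column iw with kernel tap k writes to output column ow = iw * stride - pad + k, the start_x / filter_start_x arithmetic can be checked numerically, e.g. with stride = 2, pad = 1, kernel = 3 (a sketch, not MACE code):
```
#include <algorithm>
#include <cassert>
#include <cstdint>

typedef int64_t index_t;

int main() {
  const int stride = 2, pad = 1, kernel = 3;
  const index_t ow = 5;  // output column being computed
  // First contributing input column, as in the code above.
  index_t start_x = std::max<index_t>(0, ow + stride - 1 - pad) / stride;  // 2
  // Kernel tap paired with that input column, after the mirror.
  index_t filter_start_x = kernel - 1 - (pad + stride * start_x - ow);     // 2
  // Consistency check against the scatter view: ow == iw * stride - pad + k.
  assert(start_x * stride - pad + filter_start_x == ow);
  return 0;
}
```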
......@@ -34,10 +34,10 @@ struct DepthToSpaceOpFunctor {
: block_size_(block_size), d2s_(d2s) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_UNUSED(future);
const int batch_size = input->dim(0);
const int input_depth = input->dim(1);
const int input_height = input->dim(2);
const int input_width = input->dim(3);
const index_t batch_size = input->dim(0);
const index_t input_depth = input->dim(1);
const index_t input_height = input->dim(2);
const index_t input_width = input->dim(3);
index_t output_depth, output_width, output_height;
......@@ -62,11 +62,11 @@ struct DepthToSpaceOpFunctor {
if (d2s_) {
#pragma omp parallel for
for (int b = 0; b < batch_size; ++b) {
for (int d = 0; d < output_depth; ++d) {
for (int h = 0; h < output_height; ++h) {
const int in_h = h / block_size_;
const int offset_h = (h % block_size_);
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < output_depth; ++d) {
for (index_t h = 0; h < output_height; ++h) {
const index_t in_h = h / block_size_;
const index_t offset_h = (h % block_size_);
for (int w = 0; w < output_width; ++w) {
const index_t in_w = w / block_size_;
const index_t offset_w = w % block_size_;
......@@ -86,18 +86,18 @@ struct DepthToSpaceOpFunctor {
}
} else {
#pragma omp parallel for
for (int b = 0; b < batch_size; ++b) {
for (int d = 0; d < input_depth; ++d) {
for (int h = 0; h < input_height; ++h) {
const int out_h = h / block_size_;
const int offset_h = (h % block_size_);
for (int w = 0; w < input_width; ++w) {
const int out_w = w / block_size_;
const int offset_w = (w % block_size_);
const int offset_d =
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < input_depth; ++d) {
for (index_t h = 0; h < input_height; ++h) {
const index_t out_h = h / block_size_;
const index_t offset_h = (h % block_size_);
for (index_t w = 0; w < input_width; ++w) {
const index_t out_w = w / block_size_;
const index_t offset_w = (w % block_size_);
const index_t offset_d =
(offset_h * block_size_ + offset_w) * input_depth;
const int out_d = d + offset_d;
const index_t out_d = d + offset_d;
const index_t o_index =
((b * output_depth + out_d) * output_height + out_h)
* output_width + out_w;
......
......@@ -78,28 +78,27 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
void DepthwiseConv2dGeneral(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
const index_t *in_shape,
const index_t *out_shape,
const index_t *filter_shape,
const int *stride_hw,
const int *dilation_hw,
const int *pad_hw,
float *output) {
const index_t multiplier = out_channels / in_channels;
const index_t multiplier = filter_shape[0] / filter_shape[1];
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
for (index_t b = 0; b < in_shape[0]; ++b) {
for (index_t m = 0; m < filter_shape[0]; ++m) {
for (index_t h = 0; h < out_shape[2]; ++h) {
for (index_t w = 0; w < out_shape[3]; ++w) {
const index_t out_channels = filter_shape[0];
const index_t in_channels = filter_shape[1];
const index_t filter_height = filter_shape[2];
const index_t filter_width = filter_shape[3];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w;
index_t c = m / multiplier;
......@@ -107,8 +106,8 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
float sum = 0;
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
index_t ih = h * stride_h + kh * dilation_h - pad_top;
index_t iw = w * stride_w + kw * dilation_w - pad_left;
index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0];
index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1];
if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw;
......@@ -214,20 +213,18 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
const int pad_hw[2] = {pad_top, pad_left};
const index_t input_shape[4] =
{batch, input_channels, input_height, input_width};
if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNeonK3x3S1(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
pad_top,
pad_left,
input_shape,
output_shape.data(),
pad_hw,
valid_h_start,
valid_h_stop,
valid_w_start,
......@@ -239,15 +236,9 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNeonK3x3S2(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
pad_top,
pad_left,
input_shape,
output_shape.data(),
pad_hw,
valid_h_start,
valid_h_stop,
valid_w_start,
......@@ -258,21 +249,12 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dGeneral(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
input_shape,
output_shape.data(),
filter_shape.data(),
strides_,
dilations_,
pad_hw,
output);
};
}
......
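Editorial note: in DepthwiseConv2dGeneral above, filter_shape[0] is the output channel count and filter_shape[1] the input channel count, so multiplier = out_channels / in_channels and output channel m reads input channel m / multiplier. A tiny hypothetical check of that mapping (not MACE code):
```
#include <cassert>
#include <cstdint>

typedef int64_t index_t;

int main() {
  const index_t in_channels = 3;
  const index_t multiplier = 2;
  const index_t out_channels = in_channels * multiplier;  // 6, i.e. filter_shape[0]
  // Output channels 0,1 read input channel 0; 2,3 read 1; 4,5 read 2.
  const index_t expected[6] = {0, 0, 1, 1, 2, 2};
  for (index_t m = 0; m < out_channels; ++m) {
    assert(m / multiplier == expected[m]);
  }
  return 0;
}
```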
......@@ -140,8 +140,8 @@ inline void GemmTile(const float *A,
#endif
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
for (h = 0; h + 7 < height; h += 8) {
for (k = 0; k + 7 < K; k += 8) {
for (h = 0; h < height - 7; h += 8) {
for (k = 0; k < K - 7; k += 8) {
const float *a_ptr = A + (h * stride_k + k);
#ifdef __clang__
int nw = width >> 2;
......@@ -185,156 +185,150 @@ inline void GemmTile(const float *A,
float *c_ptr7 = C + (h + 7) * stride_w;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v24.4s}, [%1] \n"
// load b: 0-7
"prfm pldl1keep, [%9, #128] \n"
"ld1 {v16.4s}, [%9], #16 \n"
"prfm pldl1keep, [%10, #128] \n"
"ld1 {v17.4s}, [%10], #16 \n"
"prfm pldl1keep, [%11, #128] \n"
"ld1 {v18.4s}, [%11], #16 \n"
"prfm pldl1keep, [%12, #128] \n"
"ld1 {v19.4s}, [%12], #16 \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v25.4s}, [%2] \n"
"prfm pldl1keep, [%13, #128] \n"
"ld1 {v20.4s}, [%13], #16 \n"
"prfm pldl1keep, [%14, #128] \n"
"ld1 {v21.4s}, [%14], #16 \n"
"prfm pldl1keep, [%15, #128] \n"
"ld1 {v22.4s}, [%15], #16 \n"
"prfm pldl1keep, [%16, #128] \n"
"ld1 {v23.4s}, [%16], #16 \n"
"prfm pldl1keep, [%3, #128] \n"
"ld1 {v26.4s}, [%3] \n"
"fmla v24.4s, v16.4s, %34.s[0] \n"
"fmla v24.4s, v17.4s, %34.s[1] \n"
"fmla v24.4s, v18.4s, %34.s[2] \n"
"fmla v24.4s, v19.4s, %34.s[3] \n"
"fmla v24.4s, v20.4s, %35.s[0] \n"
"fmla v24.4s, v21.4s, %35.s[1] \n"
"fmla v24.4s, v22.4s, %35.s[2] \n"
"fmla v24.4s, v23.4s, %35.s[3] \n"
"st1 {v24.4s}, [%1], #16 \n"
"fmla v25.4s, v16.4s, %36.s[0] \n"
"fmla v25.4s, v17.4s, %36.s[1] \n"
"fmla v25.4s, v18.4s, %36.s[2] \n"
"fmla v25.4s, v19.4s, %36.s[3] \n"
"fmla v25.4s, v20.4s, %37.s[0] \n"
"fmla v25.4s, v21.4s, %37.s[1] \n"
"fmla v25.4s, v22.4s, %37.s[2] \n"
"fmla v25.4s, v23.4s, %37.s[3] \n"
"prfm pldl1keep, [%4, #128] \n"
"ld1 {v24.4s}, [%4] \n"
"st1 {v25.4s}, [%2], #16 \n"
"fmla v26.4s, v16.4s, %38.s[0] \n"
"fmla v26.4s, v17.4s, %38.s[1] \n"
"fmla v26.4s, v18.4s, %38.s[2] \n"
"fmla v26.4s, v19.4s, %38.s[3] \n"
"prfm pldl1keep, [%9, #128] \n"
"ld1 {v16.4s}, [%9], #16 \n"
"fmla v26.4s, v20.4s, %39.s[0] \n"
"fmla v26.4s, v21.4s, %39.s[1] \n"
"fmla v26.4s, v22.4s, %39.s[2] \n"
"fmla v26.4s, v23.4s, %39.s[3] \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v18.4s}, [%1] \n"
"prfm pldl1keep, [%5, #128] \n"
"ld1 {v25.4s}, [%5] \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v19.4s}, [%2] \n"
"st1 {v26.4s}, [%3], #16 \n"
"fmla v24.4s, v16.4s, %40.s[0] \n"
"fmla v24.4s, v17.4s, %40.s[1] \n"
"fmla v24.4s, v18.4s, %40.s[2] \n"
"fmla v24.4s, v19.4s, %40.s[3] \n"
"fmla v24.4s, v20.4s, %41.s[0] \n"
"fmla v24.4s, v21.4s, %41.s[1] \n"
"fmla v24.4s, v22.4s, %41.s[2] \n"
"fmla v24.4s, v23.4s, %41.s[3] \n"
"prfm pldl1keep, [%6, #128] \n"
"ld1 {v26.4s}, [%6] \n"
"st1 {v24.4s}, [%4], #16 \n"
"fmla v25.4s, v16.4s, %42.s[0] \n"
"fmla v25.4s, v17.4s, %42.s[1] \n"
"fmla v25.4s, v18.4s, %42.s[2] \n"
"fmla v25.4s, v19.4s, %42.s[3] \n"
"fmla v25.4s, v20.4s, %43.s[0] \n"
"fmla v25.4s, v21.4s, %43.s[1] \n"
"fmla v25.4s, v22.4s, %43.s[2] \n"
"fmla v25.4s, v23.4s, %43.s[3] \n"
"prfm pldl1keep, [%7, #128] \n"
"ld1 {v24.4s}, [%7] \n"
"st1 {v25.4s}, [%5], #16 \n"
"fmla v26.4s, v16.4s, %44.s[0] \n"
"fmla v26.4s, v17.4s, %44.s[1] \n"
"fmla v26.4s, v18.4s, %44.s[2] \n"
"fmla v26.4s, v19.4s, %44.s[3] \n"
"fmla v26.4s, v20.4s, %45.s[0] \n"
"fmla v26.4s, v21.4s, %45.s[1] \n"
"fmla v26.4s, v22.4s, %45.s[2] \n"
"fmla v26.4s, v23.4s, %45.s[3] \n"
"prfm pldl1keep, [%8, #128] \n"
"ld1 {v25.4s}, [%8] \n"
"st1 {v26.4s}, [%6], #16 \n"
"fmla v24.4s, v16.4s, %46.s[0] \n"
"fmla v24.4s, v17.4s, %46.s[1] \n"
"fmla v24.4s, v18.4s, %46.s[2] \n"
"fmla v24.4s, v19.4s, %46.s[3] \n"
"fmla v24.4s, v20.4s, %47.s[0] \n"
"fmla v24.4s, v21.4s, %47.s[1] \n"
"fmla v24.4s, v22.4s, %47.s[2] \n"
"fmla v24.4s, v23.4s, %47.s[3] \n"
"st1 {v24.4s}, [%7], #16 \n"
"fmla v25.4s, v16.4s, %48.s[0] \n"
"fmla v25.4s, v17.4s, %48.s[1] \n"
"fmla v25.4s, v18.4s, %48.s[2] \n"
"fmla v25.4s, v19.4s, %48.s[3] \n"
"fmla v25.4s, v20.4s, %49.s[0] \n"
"fmla v25.4s, v21.4s, %49.s[1] \n"
"fmla v25.4s, v22.4s, %49.s[2] \n"
"fmla v25.4s, v23.4s, %49.s[3] \n"
"st1 {v25.4s}, [%8], #16 \n"
"0: \n"
"subs %w0, %w0, #1 \n"
"bne 0b \n"
: "=r"(nw), // 0
"prfm pldl1keep, [%3, #128] \n"
"ld1 {v20.4s}, [%3] \n"
"prfm pldl1keep, [%4, #128] \n"
"ld1 {v21.4s}, [%4] \n"
"prfm pldl1keep, [%5, #128] \n"
"ld1 {v22.4s}, [%5] \n"
"prfm pldl1keep, [%6, #128] \n"
"ld1 {v23.4s}, [%6] \n"
"prfm pldl1keep, [%7, #128] \n"
"ld1 {v24.4s}, [%7] \n"
"prfm pldl1keep, [%8, #128] \n"
"ld1 {v25.4s}, [%8] \n"
"prfm pldl1keep, [%10, #128] \n"
"ld1 {v17.4s}, [%10], #16 \n"
"fmla v18.4s, v16.4s, %34.s[0] \n"
"fmla v19.4s, v16.4s, %35.s[0] \n"
"fmla v20.4s, v16.4s, %36.s[0] \n"
"fmla v21.4s, v16.4s, %37.s[0] \n"
"fmla v22.4s, v16.4s, %38.s[0] \n"
"fmla v23.4s, v16.4s, %39.s[0] \n"
"fmla v24.4s, v16.4s, %40.s[0] \n"
"fmla v25.4s, v16.4s, %41.s[0] \n"
"fmla v18.4s, v17.4s, %34.s[1] \n"
"fmla v19.4s, v17.4s, %35.s[1] \n"
"fmla v20.4s, v17.4s, %36.s[1] \n"
"fmla v21.4s, v17.4s, %37.s[1] \n"
"prfm pldl1keep, [%11, #128] \n"
"ld1 {v16.4s}, [%11], #16 \n"
"fmla v22.4s, v17.4s, %38.s[1] \n"
"fmla v23.4s, v17.4s, %39.s[1] \n"
"fmla v24.4s, v17.4s, %40.s[1] \n"
"fmla v25.4s, v17.4s, %41.s[1] \n"
"fmla v18.4s, v16.4s, %34.s[2] \n"
"fmla v19.4s, v16.4s, %35.s[2] \n"
"fmla v20.4s, v16.4s, %36.s[2] \n"
"fmla v21.4s, v16.4s, %37.s[2] \n"
"prfm pldl1keep, [%12, #128] \n"
"ld1 {v17.4s}, [%12], #16 \n"
"fmla v22.4s, v16.4s, %38.s[2] \n"
"fmla v23.4s, v16.4s, %39.s[2] \n"
"fmla v24.4s, v16.4s, %40.s[2] \n"
"fmla v25.4s, v16.4s, %41.s[2] \n"
"fmla v18.4s, v17.4s, %34.s[3] \n"
"fmla v19.4s, v17.4s, %35.s[3] \n"
"fmla v20.4s, v17.4s, %36.s[3] \n"
"fmla v21.4s, v17.4s, %37.s[3] \n"
"prfm pldl1keep, [%13, #128] \n"
"ld1 {v16.4s}, [%13], #16 \n"
"fmla v22.4s, v17.4s, %38.s[3] \n"
"fmla v23.4s, v17.4s, %39.s[3] \n"
"fmla v24.4s, v17.4s, %40.s[3] \n"
"fmla v25.4s, v17.4s, %41.s[3] \n"
"fmla v18.4s, v16.4s, %42.s[0] \n"
"fmla v19.4s, v16.4s, %43.s[0] \n"
"fmla v20.4s, v16.4s, %44.s[0] \n"
"fmla v21.4s, v16.4s, %45.s[0] \n"
"prfm pldl1keep, [%14, #128] \n"
"ld1 {v17.4s}, [%14], #16 \n"
"fmla v22.4s, v16.4s, %46.s[0] \n"
"fmla v23.4s, v16.4s, %47.s[0] \n"
"fmla v24.4s, v16.4s, %48.s[0] \n"
"fmla v25.4s, v16.4s, %49.s[0] \n"
"fmla v18.4s, v17.4s, %42.s[1] \n"
"fmla v19.4s, v17.4s, %43.s[1] \n"
"fmla v20.4s, v17.4s, %44.s[1] \n"
"fmla v21.4s, v17.4s, %45.s[1] \n"
"prfm pldl1keep, [%15, #128] \n"
"ld1 {v16.4s}, [%15], #16 \n"
"fmla v22.4s, v17.4s, %46.s[1] \n"
"fmla v23.4s, v17.4s, %47.s[1] \n"
"fmla v24.4s, v17.4s, %48.s[1] \n"
"fmla v25.4s, v17.4s, %49.s[1] \n"
"fmla v18.4s, v16.4s, %42.s[2] \n"
"fmla v19.4s, v16.4s, %43.s[2] \n"
"fmla v20.4s, v16.4s, %44.s[2] \n"
"fmla v21.4s, v16.4s, %45.s[2] \n"
"prfm pldl1keep, [%16, #128] \n"
"ld1 {v17.4s}, [%16], #16 \n"
"fmla v22.4s, v16.4s, %46.s[2] \n"
"fmla v23.4s, v16.4s, %47.s[2] \n"
"fmla v24.4s, v16.4s, %48.s[2] \n"
"fmla v25.4s, v16.4s, %49.s[2] \n"
"fmla v18.4s, v17.4s, %42.s[3] \n"
"fmla v19.4s, v17.4s, %43.s[3] \n"
"fmla v20.4s, v17.4s, %44.s[3] \n"
"fmla v21.4s, v17.4s, %45.s[3] \n"
"st1 {v18.4s}, [%1], #16 \n"
"st1 {v19.4s}, [%2], #16 \n"
"st1 {v20.4s}, [%3], #16 \n"
"st1 {v21.4s}, [%4], #16 \n"
"fmla v22.4s, v17.4s, %46.s[3] \n"
"fmla v23.4s, v17.4s, %47.s[3] \n"
"fmla v24.4s, v17.4s, %48.s[3] \n"
"fmla v25.4s, v17.4s, %49.s[3] \n"
"st1 {v22.4s}, [%5], #16 \n"
"st1 {v23.4s}, [%6], #16 \n"
"st1 {v24.4s}, [%7], #16 \n"
"st1 {v25.4s}, [%8], #16 \n"
"prfm pldl1keep, [%9, #128] \n"
"ld1 {v16.4s}, [%9], #16 \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v18.4s}, [%1] \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v19.4s}, [%2] \n"
"subs %w0, %w0, #1 \n"
"bne 0b \n"
: "=r"(nw), // 0
"=r"(c_ptr0), // 1
"=r"(c_ptr1), // 2
"=r"(c_ptr2), // 3
......@@ -351,7 +345,7 @@ inline void GemmTile(const float *A,
"=r"(b_ptr5), // 14
"=r"(b_ptr6), // 15
"=r"(b_ptr7) // 16
: "0"(nw), // 17
: "0"(nw), // 17
"1"(c_ptr0), // 18
"2"(c_ptr1), // 19
"3"(c_ptr2), // 20
......@@ -369,20 +363,20 @@ inline void GemmTile(const float *A,
"15"(b_ptr6), // 32
"16"(b_ptr7), // 33
"w"(a0), // 34
"w"(a1), // 35
"w"(a2), // 36
"w"(a3), // 37
"w"(a4), // 38
"w"(a5), // 39
"w"(a6), // 40
"w"(a7), // 41
"w"(a8), // 42
"w"(a9), // 43
"w"(a10), // 44
"w"(a11), // 45
"w"(a12), // 46
"w"(a13), // 47
"w"(a14), // 48
"w"(a2), // 35
"w"(a4), // 36
"w"(a6), // 37
"w"(a8), // 38
"w"(a10), // 39
"w"(a12), // 40
"w"(a14), // 41
"w"(a1), // 42
"w"(a3), // 43
"w"(a5), // 44
"w"(a7), // 45
"w"(a9), // 46
"w"(a11), // 47
"w"(a13), // 48
"w"(a15) // 49
: "cc", "memory",
"v16",
......@@ -585,7 +579,6 @@ void Gemm(const float *A,
}
memset(C, 0, sizeof(float) * batch * height * width);
// It is better to use a large block size if it fits in the fast cache.
// Assuming an L1 cache size of 32KB and three blocks (A, B, C) loaded at a time,
// the block size should be sqrt(32KB / sizeof(T) / 3).
......
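Editorial note: the cache comment above implies a concrete block edge. With a 32KB L1 cache, float elements, and three resident blocks (A, B, C), sqrt(32 * 1024 / sizeof(float) / 3) ≈ 52, so blocks of roughly 52×52 floats per operand fit. A quick hypothetical check of that arithmetic (not MACE code):
```
#include <cmath>
#include <cstdio>

int main() {
  const double l1_bytes = 32 * 1024;        // assumed L1 data cache size
  const double elem = sizeof(float);        // 4 bytes per element
  const double block = std::sqrt(l1_bytes / elem / 3.0);  // three blocks: A, B, C
  std::printf("block edge ~ %.0f floats\n", block);       // prints ~52
  return 0;
}
```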
......@@ -45,7 +45,8 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
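Editorial note: the hunk above and the many OpenCL hunks that follow all apply the same change — the Buffer constructor no longer takes a size, so the out-of-range-check buffer is created first and then explicitly allocated with Allocate(1) before being mapped and zeroed. A schematic of the new sequence, with a stand-in Buffer type since the real one lives in mace/core (a sketch, not MACE code):
```
#include <cstddef>
#include <memory>

// Stand-in for mace::Buffer, just to show the two-step create/allocate flow.
struct FakeBuffer {
  explicit FakeBuffer(void *allocator) { (void)allocator; }
  void Allocate(size_t nbytes) { data_ = new char[nbytes](); }
  void Map(void *) {}
  char *mutable_data() { return data_; }
  void UnMap() {}
  ~FakeBuffer() { delete[] data_; }
  char *data_ = nullptr;
};

int main() {
  std::unique_ptr<FakeBuffer> kernel_error(new FakeBuffer(nullptr));
  kernel_error->Allocate(1);            // was a constructor argument before
  kernel_error->Map(nullptr);
  *(kernel_error->mutable_data()) = 0;  // clear the error flag
  kernel_error->UnMap();
  return 0;
}
```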
......@@ -58,7 +58,8 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -56,7 +56,8 @@ void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -49,7 +49,8 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -90,7 +91,8 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
if (lws[i] != 0)
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
......
......@@ -93,7 +93,8 @@ void BufferToImageFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -56,7 +56,8 @@ void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -67,7 +67,8 @@ static void Concat2(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -148,7 +149,8 @@ static void ConcatN(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......
......@@ -100,7 +100,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......
......@@ -86,7 +86,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......
......@@ -94,7 +94,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......
......@@ -65,7 +65,8 @@ void Deconv2dOpencl(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......
......@@ -86,7 +86,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -97,7 +97,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......
......@@ -97,7 +97,8 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -74,7 +74,8 @@ void FCWXKernel(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......@@ -200,7 +201,8 @@ void FCWTXKernel(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
(*kernel_error)->Allocate(1);
(*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap();
......
......@@ -86,7 +86,8 @@ void ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK");
if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -54,7 +54,8 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -57,7 +57,8 @@ bool BufferToImageOpImpl(Tensor *buffer,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error->Allocate(1);
kernel_error->Map(nullptr);
*(kernel_error->mutable_data<char>()) = 0;
kernel_error->UnMap();
......
......@@ -60,7 +60,8 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -72,7 +72,8 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -78,7 +78,8 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -51,7 +51,8 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -70,7 +70,8 @@ void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -70,7 +70,8 @@ void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -39,7 +39,8 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......@@ -138,7 +139,8 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
new Buffer(GetDeviceAllocator(DeviceType::GPU))));
kernel_error_->Allocate(1);
kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
......
......@@ -75,39 +75,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
}
void MaxPooling(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t channels,
const index_t out_height,
const index_t out_width,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
const index_t *in_shape,
const index_t *out_shape,
const int *filter_hw,
const int *stride_hw,
const int *dilation_hw,
const int *pad_hw,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < out_shape[1]; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
const index_t in_base = b * in_batch_size + c * in_image_size;
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
const index_t out_offset = out_base + h * out_width + w;
float res = std::numeric_limits<float>::lowest();
for (int fh = 0; fh < filter_height; ++fh) {
for (int fw = 0; fw < filter_width; ++fw) {
int inh = h * stride_h + dilation_h * fh - pad_top;
int inw = w * stride_w + dilation_w * fw - pad_left;
for (int fh = 0; fh < filter_hw[0]; ++fh) {
for (int fw = 0; fw < filter_hw[1]; ++fw) {
index_t inh =
h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0];
index_t inw =
w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1];
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
index_t input_offset = in_base + inh * in_width + inw;
res = std::max(res, input[input_offset]);
......@@ -122,40 +121,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
}
void AvgPooling(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t channels,
const index_t out_height,
const index_t out_width,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
const index_t *in_shape,
const index_t *out_shape,
const int *filter_hw,
const int *stride_hw,
const int *dilation_hw,
const int *pad_hw,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < out_shape[1]; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
const index_t in_base = b * in_batch_size + c * in_image_size;
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
const index_t out_offset = out_base + h * out_width + w;
float res = 0;
int block_size = 0;
for (int fh = 0; fh < filter_height; ++fh) {
for (int fw = 0; fw < filter_width; ++fw) {
int inh = h * stride_h + dilation_h * fh - pad_top;
int inw = w * stride_w + dilation_w * fw - pad_left;
for (int fh = 0; fh < filter_hw[0]; ++fh) {
for (int fw = 0; fw < filter_hw[1]; ++fw) {
index_t inh =
h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0];
index_t inw =
w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1];
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
index_t input_offset = in_base + inh * in_width + inw;
res += input[input_offset];
......@@ -200,59 +197,25 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
const index_t *input_shape = input_tensor->shape().data();
index_t batch = output_shape[0];
index_t channels = output_shape[1];
index_t height = output_shape[2];
index_t width = output_shape[3];
index_t input_height = input_shape[2];
index_t input_width = input_shape[3];
int filter_h = kernels_[0];
int filter_w = kernels_[1];
int stride_h = strides_[0];
int stride_w = strides_[1];
int dilation_h = dilations_[0];
int dilation_w = dilations_[1];
int pad_top = paddings[0] / 2;
int pad_left = paddings[1] / 2;
int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2};
if (pooling_type_ == PoolingType::MAX) {
MaxPooling(input,
batch,
input_height,
input_width,
channels,
height,
width,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
input_shape,
output_shape.data(),
kernels_,
strides_,
dilations_,
pad_hw,
output);
} else if (pooling_type_ == PoolingType::AVG) {
AvgPooling(input,
batch,
input_height,
input_width,
channels,
height,
width,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
input_shape,
output_shape.data(),
kernels_,
strides_,
dilations_,
pad_hw,
output);
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -38,7 +38,7 @@ class ActivationOp : public Operator<D, T> {
const Tensor *input_tensor = this->Input(0);
const Tensor *alpha_tensor =
this->InputSize() >= 2 ? this->Input(1) : nullptr;
Tensor *output_tensor = this->outputs_[0];
Tensor *output_tensor = this->Output(0);
output_tensor->ResizeLike(input_tensor);
functor_(input_tensor, alpha_tensor, output_tensor, future);
......
......@@ -620,6 +620,8 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
static void Near(const Tensor &x, const Tensor &y,
const double rel_err,
const double abs_err) {
MACE_UNUSED(rel_err);
MACE_UNUSED(abs_err);
Equal(x, y);
}
};
......
......@@ -56,7 +56,11 @@ class RunMetadata {
const char *MaceVersion();
enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 };
enum MaceStatus {
MACE_SUCCESS = 0,
MACE_INVALID_ARGS = 1,
MACE_OUT_OF_RESOURCES = 2
};
// MACE input/output tensor
class MaceTensor {
......@@ -84,13 +88,14 @@ class MaceTensor {
class MaceEngine {
public:
explicit MaceEngine(const NetDef *net_def,
DeviceType device_type,
const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes,
const unsigned char *model_data);
explicit MaceEngine(DeviceType device_type);
~MaceEngine();
MaceStatus Init(const NetDef *net_def,
const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes,
const unsigned char *model_data);
MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
std::map<std::string, MaceTensor> *outputs);
......
......@@ -128,7 +128,7 @@ def main(unused_args):
FLAGS.weight_file)
output_graph_def = converter.run()
print("Transform model to one that can better run on device.")
print("Transform model to one that can better run on device")
if not FLAGS.runtime:
cpu_graph_def = copy.deepcopy(output_graph_def)
option.device = mace_pb2.CPU
......
......@@ -136,23 +136,25 @@ class MaceKeyword(object):
class TransformerRule(Enum):
REMOVE_IDENTITY_OP = 0
TRANSFORM_GLOBAL_POOLING = 1
FOLD_SOFTMAX = 2
FOLD_BATCHNORM = 3,
FOLD_CONV_AND_BN = 4,
FOLD_DEPTHWISE_CONV_AND_BN = 5,
TRANSFORM_GPU_WINOGRAD = 6,
TRANSFORM_ADD_TO_BIASADD = 7,
FOLD_BIASADD = 8,
FOLD_ACTIVATION = 9,
TRANSPOSE_FILTERS = 10,
RESHAPE_FC_WEIGHT = 11,
TRANSPOSE_DATA_FORMAT = 12,
TRANSFORM_GLOBAL_CONV_TO_FC = 13,
TRANSFORM_BUFFER_IMAGE = 14,
ADD_DEVICE_AND_DATA_TYPE = 15,
SORT_BY_EXECUTION = 16
REMOVE_USELESS_RESHAPE_OP = 0
REMOVE_IDENTITY_OP = 1
TRANSFORM_GLOBAL_POOLING = 2
FOLD_RESHAPE = 3
TRANSFORM_MATMUL_TO_FC = 4
FOLD_BATCHNORM = 5
FOLD_CONV_AND_BN = 6
FOLD_DEPTHWISE_CONV_AND_BN = 7
TRANSFORM_GPU_WINOGRAD = 8
TRANSFORM_ADD_TO_BIASADD = 9
FOLD_BIASADD = 10
FOLD_ACTIVATION = 11
TRANSPOSE_FILTERS = 12
RESHAPE_FC_WEIGHT = 13
TRANSPOSE_DATA_FORMAT = 14
TRANSFORM_GLOBAL_CONV_TO_FC = 15
TRANSFORM_BUFFER_IMAGE = 16
ADD_DEVICE_AND_DATA_TYPE = 17
SORT_BY_EXECUTION = 18
class ConverterInterface(object):
......@@ -199,9 +201,11 @@ class ConverterOption(object):
self._device = mace_pb2.CPU
self._winograd_enabled = False
self._transformer_option = [
TransformerRule.REMOVE_USELESS_RESHAPE_OP,
TransformerRule.REMOVE_IDENTITY_OP,
TransformerRule.TRANSFORM_GLOBAL_POOLING,
TransformerRule.FOLD_SOFTMAX,
TransformerRule.FOLD_RESHAPE,
TransformerRule.TRANSFORM_MATMUL_TO_FC,
TransformerRule.FOLD_BATCHNORM,
TransformerRule.FOLD_CONV_AND_BN,
TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN,
......
......@@ -101,9 +101,11 @@ class TensorflowConverter(base_converter.ConverterInterface):
'AvgPool': self.convert_pooling,
'MaxPool': self.convert_pooling,
'Squeeze': self.convert_identity,
'MatMul': self.convert_matmul,
'Identity': self.convert_identity,
'Reshape': self.convert_reshape,
'Shape': self.convert_nop,
'Transpose': self.convert_transpose,
'Softmax': self.convert_softmax,
'ResizeBilinear': self.convert_resize_bilinear,
'Placeholder': self.convert_nop,
......@@ -144,7 +146,8 @@ class TensorflowConverter(base_converter.ConverterInterface):
for i in xrange(len(op.input)):
if op.input[i][-2:] == ':0':
op_name = op.input[i][:-2]
if op_name in self._option.input_nodes:
if op_name in self._option.input_nodes \
or op_name in self._option.output_nodes:
op.input[i] = op_name
for i in xrange(len(op.output)):
if op.output[i][-2:] == ':0':
......@@ -411,6 +414,10 @@ class TensorflowConverter(base_converter.ConverterInterface):
self._skip_tensor.update(tf_op.inputs[-1].name)
def convert_matmul(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.MatMul.name
def convert_reshape(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.Reshape.name
......@@ -430,6 +437,20 @@ class TensorflowConverter(base_converter.ConverterInterface):
shape_arg.ints.extend(shape_value)
def convert_transpose(self, tf_op):
perm = tf_op.inputs[1].eval().astype(np.int32)
ordered_perm = np.sort(perm)
mace_check(np.array_equal(perm, ordered_perm),
"Transpose not supported yet, only internal transpose"
" in composed ops might be supported")
op = self.convert_general_op(tf_op)
op.type = 'Identity'
del op.input[1:]
self._skip_tensor.add(tf_op.inputs[1].name)
def convert_mean(self, tf_op):
op = self.convert_general_op(tf_op)
del op.input[1:]
......
......@@ -53,9 +53,11 @@ class Transformer(base_converter.ConverterInterface):
def __init__(self, option, model):
# DO NOT reorder the following transformers
self._registered_transformers_order = [
TransformerRule.REMOVE_USELESS_RESHAPE_OP,
TransformerRule.REMOVE_IDENTITY_OP,
TransformerRule.TRANSFORM_GLOBAL_POOLING,
TransformerRule.FOLD_SOFTMAX,
TransformerRule.FOLD_RESHAPE,
TransformerRule.TRANSFORM_MATMUL_TO_FC,
TransformerRule.FOLD_BATCHNORM,
TransformerRule.FOLD_CONV_AND_BN,
TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN,
......@@ -72,10 +74,14 @@ class Transformer(base_converter.ConverterInterface):
TransformerRule.SORT_BY_EXECUTION,
]
self._registered_transformers = {
TransformerRule.REMOVE_USELESS_RESHAPE_OP:
self.remove_useless_reshape_op,
TransformerRule.REMOVE_IDENTITY_OP: self.remove_identity_op,
TransformerRule.TRANSFORM_GLOBAL_POOLING:
self.transform_global_pooling,
TransformerRule.FOLD_SOFTMAX: self.fold_softmax,
TransformerRule.FOLD_RESHAPE: self.fold_reshape,
TransformerRule.TRANSFORM_MATMUL_TO_FC:
self.transform_matmul_to_fc,
TransformerRule.FOLD_BATCHNORM: self.fold_batchnorm,
TransformerRule.FOLD_CONV_AND_BN:
self.fold_conv_and_bn, # data_format related
......@@ -161,18 +167,26 @@ class Transformer(base_converter.ConverterInterface):
for output_tensor in op.output:
self._producer[output_tensor] = op
for input_node in self._option.input_nodes.values():
op = mace_pb2.OperatorDef()
op.name = self.normalize_op_name(input_node.name)
op.type = 'Input'
op.output.extend(input_node.name)
output_shape = op.output_shape.add()
output_shape.dims.extend(input_node.shape)
if self._option.device == mace_pb2.CPU:
self.transpose_shape(output_shape.dims, [0, 3, 1, 2])
ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
else:
ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
self._producer[op.output[0]] = op
input_node_existed = False
for op in self._model.op:
if input_node.name in op.output:
input_node_existed = True
break
if not input_node_existed:
op = mace_pb2.OperatorDef()
op.name = self.normalize_op_name(input_node.name)
op.type = 'Input'
op.output.extend([input_node.name])
output_shape = op.output_shape.add()
output_shape.dims.extend(input_node.shape)
if ConverterUtil.data_format(
self._consumers[input_node.name][0]) \
== DataFormat.NCHW:
self.transpose_shape(output_shape.dims, [0, 3, 1, 2])
ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
else:
ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
self._producer[op.output[0]] = op
@staticmethod
def replace(obj_list, source, target):
......@@ -191,6 +205,12 @@ class Transformer(base_converter.ConverterInterface):
def normalize_op_name(name):
return name.replace(':', '_')
def get_tensor_shape(self, tensor):
producer = self._producer[tensor]
for i in xrange(len(producer.output)):
if producer.output[i] == tensor:
return list(producer.output_shape[i].dims)
def consumer_count(self, tensor_name):
return len(self._consumers.get(tensor_name, []))
......@@ -203,23 +223,68 @@ class Transformer(base_converter.ConverterInterface):
return False
def replace_output_node(self, op):
"""if it is an output node, change output node to the op before it"""
if self.is_op_output_node(op):
real_output_node = self._producer[op.input[0]]
self.replace(real_output_node.output, op.input[0], op.output[0])
print("change %s to %s" % (real_output_node.name, op.name))
def safe_remove_node(self, op, replace_op):
"""remove op.
1. change the inputs of its consumers to the outputs of replace_op
2. if the op is output node, change output node to replace op"""
if replace_op is None:
# When no replace op specified, we change the inputs of
# its consumers to the input of the op. This handles the case
# that the op is identity op and its input is a tensor.
mace_check(len(op.output) == 1 and len(op.input) == 1,
"cannot remove op that w/o replace op specified"
" and input/output length > 1" + str(op))
for consumer_op in self._consumers.get(op.output[0], []):
self.replace(consumer_op.input, op.output[0], op.input[0])
mace_check(op.output[0] not in self._option.output_nodes,
"cannot remove op that is output node")
else:
mace_check(len(op.output) == len(replace_op.output),
"cannot remove op since len(op.output) "
"!= len(replace_op.output)")
for i in xrange(len(op.output)):
for consumer_op in self._consumers.get(op.output[i], []):
self.replace(consumer_op.input,
op.output[i],
replace_op.output[i])
# if the op is output node, change replace_op output name to the op
# output name
for i in xrange(len(op.output)):
if op.output[i] in self._option.output_nodes:
for consumer in self._consumers.get(
replace_op.output[i], []):
self.replace(consumer.input,
replace_op.output[i],
op.output[i])
replace_op.output[i] = op.output[i]
self._model.op.remove(op)
def remove_useless_reshape_op(self):
net = self._model
for op in net.op:
if op.type == MaceOp.Reshape.name:
shape = list(ConverterUtil.get_arg(
op, MaceKeyword.mace_shape_str).ints)
if shape == self.get_tensor_shape(op.input[0]):
print("Remove useless reshape: %s(%s)"
% (op.name, op.type))
op.type = 'Identity'
return False
def remove_identity_op(self):
net = self._model
for op in net.op:
if op.type == 'Identity':
print("Remove identity: %s(%s)" % (op.name, op.type))
for consumer_op in self._consumers.get(op.output[0], []):
Transformer.replace(consumer_op.input, op.output[0],
op.input[0])
self.replace_output_node(op)
net.op.remove(op)
self.safe_remove_node(op,
self._producer.get(op.input[0], None))
return True
return False
......@@ -264,10 +329,10 @@ class Transformer(base_converter.ConverterInterface):
and len(self._consts[consumer_op.input[1]].dims) == 1:
print("Fold batchnorm: %s(%s)" % (op.name, op.type))
consumer_op.type = MaceOp.FoldedBatchNorm.name
inputs = [op.input[0], op.input[1], consumer_op.input[1]]
consumer_op.input[:] = inputs[:]
consumer_op.input[:] = [op.input[0], op.input[1],
consumer_op.input[1]]
net.op.remove(op)
self.safe_remove_node(op, None)
return True
return False
......@@ -514,7 +579,7 @@ class Transformer(base_converter.ConverterInterface):
filter.float_data[:] = weight_tensor_value.flat[:]
filter.dims[:] = weight_tensor_value.shape[:]
net.op.remove(op)
self.safe_remove_node(op, iwt_op)
return False
......@@ -544,10 +609,8 @@ class Transformer(base_converter.ConverterInterface):
consumer_op = self._consumers[op.output[0]][0]
if consumer_op.type == MaceOp.BiasAdd.name:
print("Fold biasadd: %s(%s)" % (op.name, op.type))
op.name = consumer_op.name
op.input.append(consumer_op.input[1])
op.output[0] = consumer_op.output[0]
net.op.remove(consumer_op)
self.safe_remove_node(consumer_op, op)
return True
return False
......@@ -575,7 +638,7 @@ class Transformer(base_converter.ConverterInterface):
or arg.name == MaceKeyword.mace_activation_max_limit_str: # noqa
op.arg.extend([arg])
net.op.remove(consumer_op)
self.safe_remove_node(consumer_op, op)
return True
return False
......@@ -651,11 +714,14 @@ class Transformer(base_converter.ConverterInterface):
op.output.extend([input_node.name])
output_shape = op.output_shape.add()
output_shape.dims.extend(input_node.shape)
self.transpose_shape(output_shape.dims, [0, 3, 1, 2])
dims_arg = op.arg.add()
dims_arg.name = MaceKeyword.mace_dims_str
dims_arg.ints.extend([0, 3, 1, 2])
ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
for output_node in self._option.output_nodes.values():
output_name = MaceKeyword.mace_output_node_name \
+ '_' + output_node.name
......@@ -673,6 +739,8 @@ class Transformer(base_converter.ConverterInterface):
dims_arg.name = MaceKeyword.mace_dims_str
dims_arg.ints.extend([0, 2, 3, 1])
ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
return False
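The two permutations inserted here, [0, 3, 1, 2] at the model inputs and [0, 2, 3, 1] at the outputs, are the NHWC-to-NCHW transpose and its inverse; a quick numpy sanity check with arbitrary values:
```
import numpy as np

nhwc = np.random.rand(2, 8, 8, 3)      # N, H, W, C
nchw = nhwc.transpose(0, 3, 1, 2)      # dims_arg [0, 3, 1, 2] at the inputs
assert nchw.shape == (2, 3, 8, 8)

back = nchw.transpose(0, 2, 3, 1)      # dims_arg [0, 2, 3, 1] at the outputs
assert np.array_equal(back, nhwc)
```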
def transpose_filters(self):
......@@ -695,21 +763,29 @@ class Transformer(base_converter.ConverterInterface):
filter_data = filter_data.transpose(3, 2, 0, 1)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
if op.type == MaceOp.FullyConnected.name:
weight = self._consts[op.input[1]]
weight_data = np.array(weight.float_data).reshape(
weight.dims)
weight_data = weight_data.transpose(1, 0)
weight.float_data[:] = weight_data.flat
weight.dims[:] = weight_data.shape
self.set_filter_format(FilterFormat.OIHW)
return False
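The (3, 2, 0, 1) permutation maps TensorFlow's HWIO filter layout to OIHW, and the (1, 0) transpose flips a fully-connected weight; the shapes can be checked with numpy (sizes are arbitrary):
```
import numpy as np

hwio = np.random.rand(3, 3, 16, 32)    # H, W, I, O (TensorFlow conv filter)
oihw = hwio.transpose(3, 2, 0, 1)      # -> O, I, H, W
assert oihw.shape == (32, 16, 3, 3)

fc_weight = np.random.rand(128, 10)    # in_dim x out_dim
assert fc_weight.transpose(1, 0).shape == (10, 128)
```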
def reshape_fc_weight(self):
print("Reshape fully connected weight shape")
net = self._model
for op in net.op:
if op.type == MaceOp.FullyConnected.name:
weight = self._consts[op.input[1]]
# NCHW
input_shape = list(self._producer[op.input[0]]
.output_shape[0].dims)
weight_shape = [weight.dims[0]] + input_shape[1:]
del weight.dims[:]
weight.dims.extend(weight_shape)
input_op = self._producer[op.input[0]]
input_shape = list(input_op.output_shape[0].dims)
input_data_format = ConverterUtil.data_format(input_op)
weight.dims[:] = [weight.dims[0]] + input_shape[1:]
if input_data_format == DataFormat.NHWC:
self.transpose_shape(weight.dims, [0, 3, 1, 2])
return False
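The reshape turns a flat [out_dim, in_dim] weight into a 4-D block that mirrors the producer's feature map, with the same [0, 3, 1, 2] permutation applied when that producer is still NHWC; a shape-only sketch with invented sizes:
```
input_shape = [1, 4, 4, 8]                        # producer output, NHWC
weight_dims = [10, 4 * 4 * 8]                     # out_dim, in_dim

weight_dims = [weight_dims[0]] + input_shape[1:]  # -> [10, 4, 4, 8]
perm = [0, 3, 1, 2]                               # input is NHWC, permute to match
weight_dims = [weight_dims[i] for i in perm]      # -> [10, 8, 4, 4]
assert weight_dims == [10, 8, 4, 4]
```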
......@@ -788,6 +864,8 @@ class Transformer(base_converter.ConverterInterface):
arg.name = MaceKeyword.mace_buffer_type
arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value
ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC)
for output_node in self._option.output_nodes.values():
output_name = MaceKeyword.mace_output_node_name \
+ '_' + output_node.name
......@@ -803,14 +881,16 @@ class Transformer(base_converter.ConverterInterface):
arg.name = MaceKeyword.mace_buffer_type
arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value
ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC)
return False
def fold_softmax(self):
def fold_reshape(self):
changed = False
net = self._model
for op in net.op:
if op.type == MaceOp.Softmax.name:
print("Fold softmax: %s(%s)" % (op.name, op.type))
if op.type == MaceOp.Softmax.name or op.type == MaceOp.MatMul.name:
print("Fold reshape: %s(%s)" % (op.name, op.type))
if self.consumer_count(op.output[0]) == 1:
consumer = self._consumers[op.output[0]][0]
if consumer.type == MaceOp.Reshape.name:
......@@ -818,15 +898,14 @@ class Transformer(base_converter.ConverterInterface):
MaceKeyword.mace_shape_str).ints # noqa
del op.output_shape[0].dims[:]
op.output_shape[0].dims.extend(shape)
self.replace_output_node(consumer)
net.op.remove(consumer)
self.safe_remove_node(consumer, op)
changed = True
producer = self._producer[op.input[0]]
if producer.type == MaceOp.Reshape.name:
op.input[0] = producer.input[0]
self.replace_output_node(producer)
net.op.remove(producer)
self.safe_remove_node(producer,
self._producer[
producer.input[0]])
changed = True
if len(op.output_shape[0].dims) < 4:
......@@ -839,6 +918,20 @@ class Transformer(base_converter.ConverterInterface):
return False
def transform_matmul_to_fc(self):
net = self._model
for op in net.op:
if op.type == MaceOp.MatMul.name:
input_shape = self.get_tensor_shape(op.input[0])
_, h, w, _ = self.sort_feature_map_shape(input_shape,
ConverterUtil.data_format(self._producer[op.input[0]])) # noqa
if h == 1 and w == 1 and op.input[1] in self._consts:
weight = self._consts[op.input[1]]
if len(weight.dims) == 2:
op.type = MaceOp.FullyConnected.name
return False
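The rewrite relies on a MatMul over a spatially 1x1 feature map being exactly a fully-connected layer; a small numpy check of that equivalence with made-up shapes:
```
import numpy as np

x = np.random.rand(1, 1, 1, 64).astype(np.float32)   # 1x1 spatial feature map
w = np.random.rand(64, 10).astype(np.float32)         # constant 2-D weight

matmul_out = np.matmul(x.reshape(1, 64), w)           # what MatMul computes
fc_out = np.dot(x.reshape(-1), w)                      # what a fully-connected layer computes
assert np.allclose(matmul_out.reshape(-1), fc_out)
```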
def transform_global_conv_to_fc(self):
"""Transform global conv to fc should be placed after transposing
input/output and filter"""
......@@ -917,4 +1010,8 @@ class Transformer(base_converter.ConverterInterface):
del net.op[:]
net.op.extend(sorted_nodes)
print("Final ops:")
for op in net.op:
print("%s (%s)" % (op.name, op.type))
return False
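The op reordering done by the (mostly elided) sorting pass boils down to a topological sort over producer-to-consumer edges; a standalone DFS sketch, not the converter's implementation:
```
def topo_sort(op_names, consumers):
    # consumers maps an op name to the ops that read its outputs.
    # Reversed DFS post-order over successors yields a valid execution order.
    visited, order = set(), []

    def visit(name):
        if name in visited:
            return
        visited.add(name)
        for succ in consumers.get(name, []):
            visit(succ)
        order.append(name)

    for name in op_names:
        visit(name)
    return list(reversed(order))

# 'conv' feeds 'bn', which feeds 'softmax'.
assert topo_sort(['softmax', 'bn', 'conv'],
                 {'conv': ['bn'], 'bn': ['softmax']}) == ['conv', 'bn', 'softmax']
```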
......@@ -64,26 +64,25 @@ MaceStatus CreateMaceEngineFromCode(
}
const unsigned char * model_data = nullptr;
NetDef net_def;
MaceStatus status = MaceStatus::MACE_SUCCESS;
switch (model_name_map[model_name]) {
{% for i in range(model_tags |length) %}
case {{ i }}:
model_data =
mace::{{model_tags[i]}}::LoadModelData(model_data_file);
net_def = mace::{{model_tags[i]}}::CreateNet();
engine->reset(
new mace::MaceEngine(&net_def, device_type, input_nodes, output_nodes,
model_data));
engine->reset(new mace::MaceEngine(device_type));
status = (*engine)->Init(&net_def, input_nodes, output_nodes, model_data);
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::{{model_tags[i]}}::UnloadModelData(model_data);
}
break;
{% endfor %}
default:
return MaceStatus::MACE_INVALID_ARGS;
status = MaceStatus::MACE_INVALID_ARGS;
}
return MaceStatus::MACE_SUCCESS;
return status;
}
{% else %}
MaceStatus CreateMaceEngineFromCode(
......
......@@ -95,12 +95,19 @@ def add_shape_const_node(net_def, op, values, name):
def convert_op_outputs(mace_op_def, tf_op):
mace_op_def.out_max_byte_size.extend(
[max_elem_size(output) for output in tf_op.outputs])
mace_op_def.output_type.extend(
[tf_dtype_2_mace_dtype(output.dtype) for output in tf_op.outputs])
output_shapes = []
for output in tf_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
shape_list = output.shape.as_list()
if not shape_list:
shape_list = [1]
elif len(shape_list) == 2:
shape_list = [1, 1, shape_list[0], shape_list[1]]
output_shape.dims.extend(shape_list)
output_shapes.append(output_shape)
mace_op_def.output_shape.extend(output_shapes)
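The DSP converter expects 4-D output shapes, so scalars and 2-D shapes get padded before being written into the op definition; the padding rule in isolation (the function name is illustrative):
```
def pad_output_shape(shape_list):
    # Scalars become a single-element shape; 2-D [a, b] is padded to
    # [1, 1, a, b]; anything else passes through unchanged.
    if not shape_list:
        return [1]
    if len(shape_list) == 2:
        return [1, 1, shape_list[0], shape_list[1]]
    return list(shape_list)

assert pad_output_shape([]) == [1]
assert pad_output_shape([128, 10]) == [1, 1, 128, 10]
assert pad_output_shape([1, 8, 8, 3]) == [1, 8, 8, 3]
```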
......@@ -159,8 +166,6 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
op_def.input.append(input_tensor.name)
op_def.input.extend([t.name for t in s2b_op.inputs[1:]])
op_def.input.extend([min_tensor.name, max_tensor.name])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in quantize_op.outputs])
convert_op_outputs(op_def, quantize_op)
elif len(first_op.outputs) > 0 and \
first_op.type == 'QuantizedReshape' and \
......@@ -193,9 +198,71 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax')
op_def.input.extend(
[input_tensor.name, min_tensor.name, max_tensor.name])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in quantize_reshape_op.outputs])
convert_op_outputs(op_def, quantize_reshape_op)
# remove Squeeze
elif len(first_op.outputs) > 0 and \
first_op.type == 'Requantize' and \
len(first_op.outputs[0].consumers()) > 0 and \
first_op.outputs[0].consumers()[0].type == 'Dequantize' and \
len(first_op.outputs[0].consumers()[0].outputs[0].consumers()) \
> 0 and \
first_op.outputs[0].consumers()[0].outputs[0].consumers()[0].type \
== 'Squeeze':
dequantize_op = first_op.outputs[0].consumers()[0]
squeeze_op = dequantize_op.outputs[0].consumers()[0]
reshape_op = squeeze_op.outputs[0].consumers()[0]
min_op = reshape_op.outputs[0].consumers()[0]
max_op = reshape_op.outputs[0].consumers()[1]
quantize_op = min_op.outputs[0].consumers()[0]
resolved_ops.add(dequantize_op.name)
resolved_ops.add(squeeze_op.name)
resolved_ops.add(reshape_op.name)
resolved_ops.add(min_op.name)
resolved_ops.add(max_op.name)
resolved_ops.add(quantize_op.name)
op_def.name = quantize_op.name
op_def.input.extend([t.name for t in first_op.inputs])
convert_op_outputs(op_def, quantize_op)
# Squeeze -> Softmax
next_op = quantize_op.outputs[0].consumers()[0] \
if len(quantize_op.outputs) > 0 else None
dequantize_op = next_op.outputs[0].consumers()[0] \
if next_op and len(next_op.outputs) > 0 and \
next_op.type == 'QuantizedReshape' and \
len(next_op.outputs[0].consumers()) > 0 else None
softmax_op = dequantize_op.outputs[0].consumers()[0]\
if dequantize_op and len(dequantize_op.outputs) > 0 and \
dequantize_op.type == 'Dequantize' and \
len(dequantize_op.outputs[0].consumers()) > 0 else None
if softmax_op and softmax_op.type == 'Softmax':
reshape_op = softmax_op.outputs[0].consumers()[0]
min_op = reshape_op.outputs[0].consumers()[0]
max_op = reshape_op.outputs[0].consumers()[1]
quantize_op = min_op.outputs[0].consumers()[0]
quantize_reshape_op = quantize_op.outputs[0].consumers()[0]
resolved_ops.add(next_op.name)
resolved_ops.add(dequantize_op.name)
resolved_ops.add(softmax_op.name)
resolved_ops.add(reshape_op.name)
resolved_ops.add(min_op.name)
resolved_ops.add(max_op.name)
resolved_ops.add(quantize_op.name)
resolved_ops.add(quantize_reshape_op.name)
softmax_op_def = net_def.op.add()
softmax_op_def.padding = padding_mode['NA']
softmax_op_def.name = quantize_reshape_op.name
softmax_op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax')
softmax_op_def.input.extend([
get_tensor_name_from_op(op_def.name, 0),
get_tensor_name_from_op(op_def.name, 1),
get_tensor_name_from_op(op_def.name, 2)])
convert_op_outputs(softmax_op_def, quantize_reshape_op)
elif len(first_op.outputs) > 0 and first_op.type == 'Dequantize' and \
len(first_op.outputs[0].consumers()) > 0 and \
first_op.outputs[0].consumers()[0].type == 'Tanh':
......@@ -220,8 +287,6 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type)
op_def.input.extend(
[input_tensor.name, min_tensor.name, max_tensor.name])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in quantize_op.outputs])
convert_op_outputs(op_def, quantize_op)
# tanh is last op
else:
......@@ -251,8 +316,6 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
get_tensor_name_from_op(op_def.name, 1),
get_tensor_name_from_op(op_def.name, 2)
])
new_tanh_op_def.out_max_byte_size.extend(
[max_elem_size(tanh_op.outputs[0])])
convert_op_outputs(new_tanh_op_def, tanh_op)
elif has_padding_and_strides(first_op):
op_def.padding = padding_mode[first_op.get_attr('padding')]
......@@ -266,19 +329,13 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
strides_tensor = add_shape_const_node(net_def, first_op, strides,
'strides')
op_def.input.extend([strides_tensor])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
elif is_node_flatten_reshape(first_op):
op_def.type = 'Flatten'
op_def.input.extend([t.name for t in first_op.inputs])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in first_op.outputs])
op_def.input.extend([first_op.inputs[0].name])
convert_op_outputs(op_def, first_op)
elif dsp_ops.has_op(first_op.type):
op_def.input.extend([t.name for t in first_op.inputs])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
else:
raise Exception('Unsupported op: ', first_op)
......@@ -478,7 +535,8 @@ def fuse_quantize(net_def, input_node, output_node):
skip_ops = skip_ops.union(
[flatten_op.name, minf_op.name, maxf_op.name])
skip_tensors = skip_tensors.union(
[flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
[minf_op.input[0], maxf_op.input[0],
quantize_op.input[1], quantize_op.input[2]])
quantize_op.type = 'AutoQuantize'
del quantize_op.input[1:]
......
......@@ -318,8 +318,10 @@ void MaceRunFunc(const int in_out_size) {
new FileStorageFactory(file_path));
mace::SetKVStorageFactory(storage_factory);
MaceEngine engine(&net_def, device, input_names, output_names,
reinterpret_cast<unsigned char *>(data.data()));
MaceEngine engine(device);
MaceStatus status = engine.Init(&net_def, input_names, output_names,
reinterpret_cast<unsigned char *>(data.data()));
ASSERT_EQ(status, MaceStatus::MACE_SUCCESS);
std::map<std::string, mace::MaceTensor> inputs;
std::map<std::string, mace::MaceTensor> outputs;
......
......@@ -323,8 +323,10 @@ void MaceRun(const int in_out_size,
&net_def);
}
MaceEngine engine(&net_def, device, input_names, output_names,
reinterpret_cast<unsigned char *>(data.data()));
MaceEngine engine(device);
MaceStatus status = engine.Init(&net_def, input_names, output_names,
reinterpret_cast<unsigned char *>(data.data()));
ASSERT_EQ(status, MaceStatus::MACE_SUCCESS);
std::map<std::string, mace::MaceTensor> inputs;
std::map<std::string, mace::MaceTensor> outputs;
......
......@@ -15,6 +15,7 @@
#include "mace/utils/logging.h"
#include <stdlib.h>
#include <string.h>
#if defined(ANDROID) || defined(__ANDROID__)
#include <android/log.h>
#include <iostream>
......