diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 51806a6863612decfd5b3fb30ba2ef1918e63f40..91d057a49e2a0a71a3e160f0f772bfeaf8891dca 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -25,7 +25,8 @@ docs:
     - cd docs
     - make html
     - CI_LATEST_OUTPUT_PATH=/mace-build-output/$CI_PROJECT_NAME/latest
-    - CI_JOB_OUTPUT_PATH=/mace-build-output/$CI_PROJECT_NAME/$CI_BUILD_ID
+    - CI_JOB_OUTPUT_PATH=/mace-build-output/$CI_PROJECT_NAME/$CI_PIPELINE_ID
+    - rm -rf $CI_JOB_OUTPUT_PATH
     - mkdir -p $CI_JOB_OUTPUT_PATH
     - cp -r _build/html $CI_JOB_OUTPUT_PATH/docs
     - rm -rf $CI_LATEST_OUTPUT_PATH
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 32d0cef583f71ff0a19593f3795b441c9103df99..8a8c089c493baa0982fbd68e8fa815ed68ea3e45 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -110,7 +110,8 @@ RUN apt-get install -y --no-install-recommends \
 
 # Install tools
 RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com setuptools
-RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.6.0 \
+RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.7.0 \
+    numpy>=1.14.0 \
     scipy \
     jinja2 \
     pyyaml \
diff --git a/docs/faq.md b/docs/faq.md
index 01d1cd9acf12d39526802c714b39c9a922cbcdbf..c8ee4a9095f050e12d45c9a5ea6cab00a57aecd2 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -1,6 +1,17 @@
 Frequently asked questions
 ==========================
 
+Does the tensor data consume extra memory when compiled into C++ code?
+----------------------------------------------------------------------
+When compiled into C++ code, the data will be mmaped by the system loader.
+For CPU runtime, the tensor data are used without memory copy.
+For GPU and DSP runtime, the tensor data is used once during model
+initialization. The operating system is free to swap the pages out, however,
+it still consumes virtual memory space. So generally speaking, it takes
+no extra physical memory. If you are short of virtual memory space (this
+should be very rare), you can choose load the tensor data from a file, which
+can be unmapped after initialization.
+
 Why is the generated static library file size so huge?
 -------------------------------------------------------
 The static library is simply an archive of a set of object files which are
diff --git a/docs/getting_started/docker.md b/docs/getting_started/docker.md
deleted file mode 100644
index f58f4a64f7320552849a543fca23ab20ea71b5e1..0000000000000000000000000000000000000000
--- a/docs/getting_started/docker.md
+++ /dev/null
@@ -1,27 +0,0 @@
-Docker Images
-=============
-
-* Login in [Xiaomi Docker Registry](http://docs.api.xiaomi.net/docker-registry/)
-
-```
-docker login cr.d.xiaomi.net
-```
-
-* Build with `Dockerfile`
-
-```
-docker build -t cr.d.xiaomi.net/mace/mace-dev
-```
-
-* Pull image from docker registry
-
-```
-docker pull cr.d.xiaomi.net/mace/mace-dev
-```
-
-* Create container
-
-```
-# Set 'host' network to use ADB
-docker run -it --rm -v /local/path:/container/path --net=host cr.d.xiaomi.net/mace/mace-dev /bin/bash
-```
diff --git a/docs/getting_started/how_to_build.rst b/docs/getting_started/how_to_build.rst
index 47781280c28220262be074192bd241071f04602a..43178cf946ac54bc81689bc6e90d993bcc02fc91 100644
--- a/docs/getting_started/how_to_build.rst
+++ b/docs/getting_started/how_to_build.rst
@@ -33,11 +33,13 @@ How to build
 +=====================+=================+===================================================================================================+
 | bazel               | >= 0.5.4        | -                                                                                                 |
 +---------------------+-----------------+---------------------------------------------------------------------------------------------------+
-| android-ndk         | r12c            | -                                                                                                 |
+| android-ndk         | r15c,r16b       | -                                                                                                 |
 +---------------------+-----------------+---------------------------------------------------------------------------------------------------+
 | adb                 | >= 1.0.32       | apt install -y android-tools-adb                                                                  |
 +---------------------+-----------------+---------------------------------------------------------------------------------------------------+
-| tensorflow          | 1.4.0           | pip install tensorflow==1.4.0                                                                     |
+| tensorflow          | 1.7.0           | pip install tensorflow==1.7.0                                                                     |
++---------------------+-----------------+---------------------------------------------------------------------------------------------------+
+| numpy               | >= 1.14.0       | pip install numpy                                                                                 |
 +---------------------+-----------------+---------------------------------------------------------------------------------------------------+
 | scipy               | >= 1.0.0        | pip install scipy                                                                                 |
 +---------------------+-----------------+---------------------------------------------------------------------------------------------------+
@@ -45,9 +47,43 @@ How to build
 +---------------------+-----------------+---------------------------------------------------------------------------------------------------+
 | PyYaml              | >= 3.12         | pip install pyyaml                                                                                |
 +---------------------+-----------------+---------------------------------------------------------------------------------------------------+
+| sh                  | >= 1.12.14      | pip install sh                                                                                    |
++---------------------+-----------------+---------------------------------------------------------------------------------------------------+
+| filelock            | >= 3.0.0        | pip install filelock                                                                              |
++---------------------+-----------------+---------------------------------------------------------------------------------------------------+
 | docker(for caffe)   | >= 17.09.0-ce   | `install doc <https://docs.docker.com/install/linux/docker-ce/ubuntu/#set-up-the-repository>`__   |
 +---------------------+-----------------+---------------------------------------------------------------------------------------------------+
 
+Docker Images
+----------------
+
+* Login in `Xiaomi Docker Registry <http://docs.api.xiaomi.net/docker-registry/>`__
+
+.. code:: sh
+
+    docker login cr.d.xiaomi.net
+
+* Build with Dockerfile
+
+.. code:: sh
+
+    docker build -t cr.d.xiaomi.net/mace/mace-dev
+
+
+* Pull image from docker registry
+
+.. code:: sh
+
+    docker pull cr.d.xiaomi.net/mace/mace-dev
+
+* Create container
+
+.. code:: sh
+
+    # Set 'host' network to use ADB
+    docker run -it --rm -v /local/path:/container/path --net=host cr.d.xiaomi.net/mace/mace-dev /bin/bash
+
+
 使用简介
 --------
 
@@ -199,29 +235,47 @@ Caffe目前只支持最新版本，旧版本请使用Caffe的工具进行升级
 
     // 引入头文件
     #include "mace/public/mace.h"
-    #include "mace/public/{MODEL_TAG}.h"
+    #include "mace/public/mace_engine_factory.h"
 
-    // 0. 设置内部存储
+    // 0. 设置内部存储（设置一次即可）
     const std::string file_path ="/path/to/store/internel/files";
     std::shared_ptr<KVStorageFactory> storage_factory(
         new FileStorageFactory(file_path));
     ConfigKVStorageFactory(storage_factory);
 
-    //1. 从文件或代码中Load模型数据，也可通过自定义的方式来Load (例如可自己实现压缩加密等)
-    // 如果使用的是数据嵌入的方式，将参数设为nullptr。
-    unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
-
-    //2. 创建net对象
-    NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
+    //1. 声明设备类型(必须与build时指定的runtime一致）
+    DeviceType device_type = DeviceType::GPU;
 
-    //3. 声明设备类型(必须与build时指定的runtime一致）
-    DeviceType device_type = DeviceType::OPENCL;
-
-    //4. 定义输入输出名称数组
+    //2. 定义输入输出名称数组
     std::vector<std::string> input_names = {...};
     std::vector<std::string> output_names = {...};
 
-    //5. 创建输入输出对象
+    //3. 创建MaceEngine对象
+    std::shared_ptr<mace::MaceEngine> engine;
+    MaceStatus create_engine_status;
+    // Create Engine
+    if (model_data_file.empty()) {
+      create_engine_status =
+          CreateMaceEngine(model_name.c_str(),
+                           nullptr,
+                           input_names,
+                           output_names,
+                           device_type,
+                           &engine);
+    } else {
+      create_engine_status =
+          CreateMaceEngine(model_name.c_str(),
+                           model_data_file.c_str(),
+                           input_names,
+                           output_names,
+                           device_type,
+                           &engine);
+    }
+    if (create_engine_status != MaceStatus::MACE_SUCCESS) {
+      // do something
+    }
+
+    //4. 创建输入输出对象
     std::map<std::string, mace::MaceTensor> inputs;
     std::map<std::string, mace::MaceTensor> outputs;
     for (size_t i = 0; i < input_count; ++i) {
@@ -246,14 +300,6 @@ Caffe目前只支持最新版本，旧版本请使用Caffe的工具进行升级
       outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
     }
 
-    //6. 创建MaceEngine对象
-    mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
-
-    //7. 如果设备类型是OPENCL或HEXAGON，可以在此释放model_data
-    if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
-      mace::MACE_MODEL_TAG::UnloadModelData(model_data);
-    }
-
-    //8. 执行模型，得到结果
+    //5. 执行模型，得到结果
     engine.Run(inputs, &outputs);
 
diff --git a/docs/getting_started/introduction.md b/docs/getting_started/introduction.md
deleted file mode 100644
index 8cfc6b72537f996fa6f7b220167caf532effac3a..0000000000000000000000000000000000000000
--- a/docs/getting_started/introduction.md
+++ /dev/null
@@ -1,8 +0,0 @@
-Introduction
-============
-
-TODO: describe the conceptions and workflow with diagram.
-![alt text](workflow.jpg "MiAI workflow")
-
-TODO: describe the runtime.
-
diff --git a/docs/getting_started/introduction.rst b/docs/getting_started/introduction.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4f9d4c8ae38c085bb0d5e54c587a65390950a5dc
--- /dev/null
+++ b/docs/getting_started/introduction.rst
@@ -0,0 +1,46 @@
+Introduction
+============
+
+MiAI Compute Engine is a deep learning inference framework optimized for
+mobile heterogeneous computing platforms. The following figure shows the
+overall architecture.
+
+.. image:: mace-arch.png
+   :scale: 40 %
+   :align: center
+
+Model format
+------------
+
+MiAI Compute Engine defines a customized model format which is similar to
+Caffe2. The MiAI model can be converted from exported models by TensorFlow
+and Caffe. We define a YAML schema to describe the model deployment. In the
+next chapter, there is a detailed guide showing how to create this YAML file.
+
+Model conversion
+----------------
+
+Currently, we provide model converters for TensorFlow and Caffe. And
+more frameworks will be supported in the future.
+
+Model loading
+-------------
+
+The MiAI model format contains two parts: the model graph definition and
+the model parameter tensors. The graph part utilizes Protocol Buffers
+for serialization. All the model parameter tensors are concatenated
+together into a continuous array, and we call this array tensor data in
+the following paragraphs. In the model graph, the tensor data offsets
+and lengths are recorded.
+
+The models can be loaded in 3 ways:
+
+1. Both model graph and tensor data are dynamically loaded externally
+   (by default, from file system, but the users are free to choose their own
+   implementations, for example, with compression or encryption). This
+   approach provides the most flexibility but the weakest model protection.
+2. Both model graph and tensor data are converted into C++ code and loaded
+   by executing the compiled code. This approach provides the strongest
+   model protection and simplest deployment.
+3. The model graph is converted into C++ code and constructed as the second
+   approach, and the tensor data is loaded externally as the first approach.
diff --git a/docs/getting_started/mace-arch.png b/docs/getting_started/mace-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..bfa4928b83df9604273a3bca4633bad40c0299ad
Binary files /dev/null and b/docs/getting_started/mace-arch.png differ
diff --git a/docs/getting_started/op_lists.rst b/docs/getting_started/op_lists.rst
index 803d32170fb6eaa6a2b742a376158b00bab2ea7c..9a1d5425b1a768a0759a06f18e2f4a2d27f4e1b2 100644
--- a/docs/getting_started/op_lists.rst
+++ b/docs/getting_started/op_lists.rst
@@ -6,20 +6,22 @@ Operator lists
     :widths: auto
     :header: "Operator","Android NN","Supported","Remark"
 
-    "ADD","Y","Y",""
     "AVERAGE_POOL_2D","Y","Y",""
     "BATCH_NORM","","Y","Fusion with activation is supported"
+    "BATCH_TO_SPACE_ND","Y","Y",""
     "BIAS_ADD","","Y",""
     "CHANNEL_SHUFFLE","","Y",""
-    "CONCATENATION","Y","Y",""
+    "CONCATENATION","Y","Y","Only support channel axis concatenation"
     "CONV_2D","Y","Y","Fusion with BN and activation layer is supported"
+    "DECONV_2D","N","Y","Only tensorflow model is supported"
     "DEPTHWISE_CONV_2D","Y","Y","Only multiplier = 1 is supported; Fusion is supported"
     "DEPTH_TO_SPACE","Y","Y",""
-    "DEQUANTIZE","Y","",""
+    "DEQUANTIZE","Y","Y","Model quantization will be supported later"
+    "ELEMENT_WISE","Y","Y","ADD/MUL/DIV/MIN/MAX/NEG/ABS/SQR_DIFF/POW"
     "EMBEDDING_LOOKUP","Y","",""
     "FLOOR","Y","",""
     "FULLY_CONNECTED","Y","Y",""
-    "GROUP_CONV_2D","","",""
+    "GROUP_CONV_2D","","","Caffe model with group count = channel count is supported"
     "HASHTABLE_LOOKUP","Y","",""
     "L2_NORMALIZATION","Y","",""
     "L2_POOL_2D","Y","",""
@@ -29,18 +31,20 @@ Operator lists
     "LSTM","Y","",""
     "MATMUL","","Y",""
     "MAX_POOL_2D","Y","Y",""
-    "MUL","Y","",""
+    "PAD", "N","Y",""
     "PSROI_ALIGN","","Y",""
-    "PRELU","","Y",""
+    "PRELU","","Y","Only caffe model is supported"
     "RELU","Y","Y",""
     "RELU1","Y","Y",""
     "RELU6","Y","Y",""
     "RELUX","","Y",""
-    "RESHAPE","Y","Y","Limited support"
+    "RESHAPE","Y","Y","Limited support: only internal use of reshape in composed operations is supported"
     "RESIZE_BILINEAR","Y","Y",""
     "RNN","Y","",""
     "RPN_PROPOSAL_LAYER","","Y",""
+    "SLICE","N","Y","Only support channel axis slice"
     "SOFTMAX","Y","Y",""
+    "SPACE_TO_BATCH_ND","Y", "Y",""
     "SPACE_TO_DEPTH","Y","Y",""
     "SVDF","Y","",""
     "TANH","Y","Y",""
diff --git a/docs/getting_started/workflow.jpg b/docs/getting_started/workflow.jpg
deleted file mode 100644
index e00ebea13ac5e261feb38306938f97fa995d47c7..0000000000000000000000000000000000000000
Binary files a/docs/getting_started/workflow.jpg and /dev/null differ
diff --git a/docs/index.rst b/docs/index.rst
index 70cfab0fdd9f1c69a62108dd0773f9dbadb21d37..a42b1655448dad6c622882f7359918b851a41576 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -11,7 +11,6 @@ The main documentation is organized into the following sections:
 
    getting_started/introduction
    getting_started/create_a_model_deployment
-   getting_started/docker
    getting_started/how_to_build
    getting_started/op_lists
 
diff --git a/mace/core/BUILD b/mace/core/BUILD
index abc5094e2caa5d8243958f39d6051ff920716446..9917dfe3254c11df195187511deb1f1afacd7df2 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -15,6 +15,7 @@ load(
     "if_production_mode",
     "if_not_production_mode",
     "if_openmp_enabled",
+    "if_neon_enabled",
 )
 
 cc_library(
@@ -51,7 +52,11 @@ cc_library(
         "-DMACE_ENABLE_OPENMP",
     ]) + if_android([
         "-DMACE_ENABLE_OPENCL",
-    ]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
+    ]) + if_hexagon_enabled([
+        "-DMACE_ENABLE_HEXAGON",
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]),
     linkopts = ["-ldl"] + if_android([
         "-pie",
         "-lm",
diff --git a/mace/core/allocator.h b/mace/core/allocator.h
index 8c73025b4923cd860f3a47f3109cc4325728b259..7ab701ddd21b15b0bb88b258d9b2f85801b8dda2 100644
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -16,6 +16,7 @@
 #define MACE_CORE_ALLOCATOR_H_
 
 #include <stdlib.h>
+#include <string.h>
 #include <map>
 #include <limits>
 #include <vector>
@@ -42,9 +43,10 @@ class Allocator {
  public:
   Allocator() {}
   virtual ~Allocator() noexcept {}
-  virtual void *New(size_t nbytes) const = 0;
-  virtual void *NewImage(const std::vector<size_t> &image_shape,
-                         const DataType dt) const = 0;
+  virtual MaceStatus New(size_t nbytes, void **result) const = 0;
+  virtual MaceStatus NewImage(const std::vector<size_t> &image_shape,
+                              const DataType dt,
+                              void **result) const = 0;
   virtual void Delete(void *data) const = 0;
   virtual void DeleteImage(void *data) const = 0;
   virtual void *Map(void *buffer, size_t offset, size_t nbytes) const = 0;
@@ -53,44 +55,54 @@ class Allocator {
                          std::vector<size_t> *mapped_image_pitch) const = 0;
   virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
   virtual bool OnHost() const = 0;
-
-  template <typename T>
-  T *New(size_t num_elements) {
-    if (num_elements > (std::numeric_limits<size_t>::max() / sizeof(T))) {
-      return nullptr;
-    }
-    void *p = New(sizeof(T) * num_elements);
-    T *typed_p = reinterpret_cast<T *>(p);
-    return typed_p;
-  }
 };
 
 class CPUAllocator : public Allocator {
  public:
   ~CPUAllocator() override {}
-  void *New(size_t nbytes) const override {
+  MaceStatus New(size_t nbytes, void **result) const override {
     VLOG(3) << "Allocate CPU buffer: " << nbytes;
+    if (nbytes == 0) {
+      return MaceStatus::MACE_SUCCESS;
+    }
     void *data = nullptr;
 #if defined(__ANDROID__) || defined(__hexagon__)
     data = memalign(kMaceAlignment, nbytes);
+    if (data == NULL) {
+      LOG(WARNING) << "Allocate CPU Buffer with "
+                   << nbytes << " bytes failed because of"
+                   << strerror(errno);
+      *result = nullptr;
+      return MaceStatus::MACE_OUT_OF_RESOURCES;
+    }
 #else
-    MACE_CHECK(posix_memalign(&data, kMaceAlignment, nbytes) == 0);
+    int ret = posix_memalign(&data, kMaceAlignment, nbytes);
+    if (ret != 0) {
+      LOG(WARNING) << "Allocate CPU Buffer with "
+                   << nbytes << " bytes failed because of"
+                   << strerror(errno);
+      *result = nullptr;
+      return MaceStatus::MACE_OUT_OF_RESOURCES;
+    }
 #endif
-    MACE_CHECK_NOTNULL(data);
     // TODO(heliangliang) This should be avoided sometimes
     memset(data, 0, nbytes);
-    return data;
+    *result = data;
+    return MaceStatus::MACE_SUCCESS;
   }
 
-  void *NewImage(const std::vector<size_t> &shape,
-                 const DataType dt) const override {
+  MaceStatus NewImage(const std::vector<size_t> &shape,
+                      const DataType dt,
+                      void **result) const override {
     MACE_UNUSED(shape);
     MACE_UNUSED(dt);
+    MACE_UNUSED(result);
     LOG(FATAL) << "Allocate CPU image";
-    return nullptr;
+    return MaceStatus::MACE_SUCCESS;
   }
 
   void Delete(void *data) const override {
+    MACE_CHECK_NOTNULL(data);
     VLOG(3) << "Free CPU buffer";
     free(data);
   }
diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc
index 8db23cae206a46a949c63de4a794ab4eac784362..9cf40d93a0d682c8a6dd04e90e869dd8514a127b 100644
--- a/mace/core/arg_helper.cc
+++ b/mace/core/arg_helper.cc
@@ -23,7 +23,8 @@ namespace mace {
 ArgumentHelper::ArgumentHelper(const OperatorDef &def) {
   for (auto &arg : def.arg()) {
     if (arg_map_.find(arg.name()) != arg_map_.end()) {
-      LOG(WARNING) << "Duplicated argument name found in operator def.";
+      LOG(WARNING) << "Duplicated argument name found in operator def: "
+                   << def.name() << " " << arg.name();
     }
 
     arg_map_[arg.name()] = arg;
diff --git a/mace/core/buffer.h b/mace/core/buffer.h
index f4b252a776296b1e065816c3a9b6288d13d03837..8ef5015c1c02c6f1e5e3feac761436cbff0cce48 100644
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -38,6 +38,11 @@ class BufferBase {
 
   virtual void *raw_mutable_data() = 0;
 
+  virtual MaceStatus Allocate(index_t nbytes) = 0;
+
+  virtual MaceStatus Allocate(const std::vector<size_t> &shape,
+                              DataType data_type) = 0;
+
   virtual void *Map(index_t offset,
                     index_t length,
                     std::vector<size_t> *pitch) const = 0;
@@ -48,7 +53,7 @@ class BufferBase {
 
   virtual void UnMap() = 0;
 
-  virtual void Resize(index_t size) = 0;
+  virtual MaceStatus Resize(index_t nbytes) = 0;
 
   virtual void Copy(void *src, index_t offset, index_t length) = 0;
 
@@ -56,6 +61,8 @@ class BufferBase {
 
   virtual void Clear() = 0;
 
+  virtual void Clear(index_t size) = 0;
+
   virtual index_t offset() const { return 0; }
 
   template <typename T>
@@ -83,14 +90,6 @@ class Buffer : public BufferBase {
         mapped_buf_(nullptr),
         is_data_owner_(true) {}
 
-  Buffer(Allocator *allocator, index_t size)
-      : BufferBase(size),
-        allocator_(allocator),
-        mapped_buf_(nullptr),
-        is_data_owner_(true) {
-    buf_ = allocator->New(size);
-  }
-
   Buffer(Allocator *allocator, void *data, index_t size)
       : BufferBase(size),
         allocator_(allocator),
@@ -132,6 +131,31 @@ class Buffer : public BufferBase {
     }
   }
 
+  MaceStatus Allocate(index_t nbytes) {
+    if (nbytes <= 0) {
+      return MaceStatus::MACE_SUCCESS;
+    }
+    MACE_CHECK(is_data_owner_,
+               "data is not owned by this buffer, cannot reallocate");
+    if (mapped_buf_ != nullptr) {
+      UnMap();
+    }
+    if (buf_ != nullptr) {
+      allocator_->Delete(buf_);
+    }
+    size_ = nbytes;
+    return allocator_->New(nbytes, &buf_);
+  }
+
+  MaceStatus Allocate(const std::vector<size_t> &shape,
+                      DataType data_type) {
+    if (shape.empty()) return MaceStatus::MACE_SUCCESS;
+    index_t nbytes = std::accumulate(shape.begin(), shape.end(),
+                                     1, std::multiplies<size_t>())
+        * GetEnumTypeSize(data_type);
+    return this->Allocate(nbytes);
+  }
+
   void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
     MACE_CHECK_NOTNULL(buf_);
     MACE_UNUSED(pitch);
@@ -154,16 +178,17 @@ class Buffer : public BufferBase {
     mapped_buf_ = nullptr;
   }
 
-  void Resize(index_t size) {
+  MaceStatus Resize(index_t nbytes) {
     MACE_CHECK(is_data_owner_,
                "data is not owned by this buffer, cannot resize");
-    if (size != size_) {
+    if (nbytes != size_) {
       if (buf_ != nullptr) {
         allocator_->Delete(buf_);
       }
-      size_ = size;
-      buf_ = allocator_->New(size);
+      size_ = nbytes;
+      return allocator_->New(nbytes, &buf_);
     }
+    return MaceStatus::MACE_SUCCESS;
   }
 
   void Copy(void *src, index_t offset, index_t length) {
@@ -175,7 +200,11 @@ class Buffer : public BufferBase {
   bool OnHost() const { return allocator_->OnHost(); }
 
   void Clear() {
-    memset(reinterpret_cast<char*>(raw_mutable_data()), 0, size_);
+    Clear(size_);
+  }
+
+  void Clear(index_t size) {
+    memset(reinterpret_cast<char*>(raw_mutable_data()), 0, size);
   }
 
  protected:
@@ -195,18 +224,6 @@ class Image : public BufferBase {
         buf_(nullptr),
         mapped_buf_(nullptr) {}
 
-  Image(std::vector<size_t> shape, DataType data_type)
-      : BufferBase(
-            std::accumulate(
-                shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
-            GetEnumTypeSize(data_type)),
-        allocator_(GetDeviceAllocator(GPU)),
-        mapped_buf_(nullptr) {
-    shape_ = shape;
-    data_type_ = data_type;
-    buf_ = allocator_->NewImage(shape, data_type);
-  }
-
   virtual ~Image() {
     if (mapped_buf_ != nullptr) {
       UnMap();
@@ -233,6 +250,29 @@ class Image : public BufferBase {
 
   std::vector<size_t> image_shape() const { return shape_; }
 
+  MaceStatus Allocate(index_t nbytes) {
+    MACE_UNUSED(nbytes);
+    LOG(FATAL) << "Image should not call this allocate function";
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+  MaceStatus Allocate(const std::vector<size_t> &shape,
+                      DataType data_type) {
+    index_t size = std::accumulate(
+        shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
+        GetEnumTypeSize(data_type);
+    if (mapped_buf_ != nullptr) {
+      UnMap();
+    }
+    if (buf_ != nullptr) {
+      allocator_->DeleteImage(buf_);
+    }
+    size_ = size;
+    shape_ = shape;
+    data_type_ = data_type;
+    return allocator_->NewImage(shape, data_type, &buf_);
+  }
+
   void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
     MACE_UNUSED(offset);
     MACE_UNUSED(length);
@@ -259,9 +299,10 @@ class Image : public BufferBase {
     mapped_buf_ = nullptr;
   }
 
-  void Resize(index_t size) {
+  MaceStatus Resize(index_t size) {
     MACE_UNUSED(size);
     MACE_NOT_IMPLEMENTED;
+    return MaceStatus::MACE_SUCCESS;
   }
 
   void Copy(void *src, index_t offset, index_t length) {
@@ -277,6 +318,11 @@ class Image : public BufferBase {
     MACE_NOT_IMPLEMENTED;
   }
 
+  void Clear(index_t size) {
+    MACE_UNUSED(size);
+    MACE_NOT_IMPLEMENTED;
+  }
+
  private:
   Allocator *allocator_;
   std::vector<size_t> shape_;
@@ -339,6 +385,20 @@ class BufferSlice : public BufferBase {
     }
   }
 
+  MaceStatus Allocate(index_t size) {
+    MACE_UNUSED(size);
+    LOG(FATAL) << "BufferSlice should not call allocate function";
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+  MaceStatus Allocate(const std::vector<size_t> &shape,
+                      DataType data_type) {
+    MACE_UNUSED(shape);
+    MACE_UNUSED(data_type);
+    LOG(FATAL) << "BufferSlice should not call allocate function";
+    return MaceStatus::MACE_SUCCESS;
+  }
+
   void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
     MACE_UNUSED(offset);
     MACE_UNUSED(length);
@@ -364,9 +424,10 @@ class BufferSlice : public BufferBase {
     mapped_buf_ = nullptr;
   }
 
-  void Resize(index_t size) {
+  MaceStatus Resize(index_t size) {
     MACE_CHECK(size == size_, "resize buffer slice from ", size_,
       " to ", size, " is illegal");
+    return MaceStatus::MACE_SUCCESS;
   }
 
   void Copy(void *src, index_t offset, index_t length) {
@@ -381,7 +442,11 @@ class BufferSlice : public BufferBase {
   bool OnHost() const { return buffer_->OnHost(); }
 
   void Clear() {
-    memset(raw_mutable_data(), 0, size_);
+    Clear(size_);
+  }
+
+  void Clear(index_t size) {
+    memset(raw_mutable_data(), 0, size);
   }
 
  private:
@@ -396,20 +461,17 @@ class ScratchBuffer: public Buffer {
     : Buffer(allocator),
       offset_(0) {}
 
-  ScratchBuffer(Allocator *allocator, index_t size)
-    : Buffer(allocator, size),
-      offset_(0) {}
-
   ScratchBuffer(Allocator *allocator, void *data, index_t size)
     : Buffer(allocator, data, size),
       offset_(0) {}
 
   virtual ~ScratchBuffer() {}
 
-  void GrowSize(index_t size) {
+  MaceStatus GrowSize(index_t size) {
     if (size > size_) {
-      Resize(size);
+      return Resize(size);
     }
+    return MaceStatus::MACE_SUCCESS;
   }
 
   BufferSlice Scratch(index_t size) {
diff --git a/mace/core/mace.cc b/mace/core/mace.cc
index 4bbcad2a135ea25b65abd55322797eba7e842e0c..bdf1fa894ebc3bc57db6bd4677dd07415a2c4b5b 100644
--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -90,13 +90,15 @@ std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
 // Mace Engine
 class MaceEngine::Impl {
  public:
-  explicit Impl(const NetDef *net_def,
-                DeviceType device_type,
-                const std::vector<std::string> &input_nodes,
-                const std::vector<std::string> &output_nodes,
-                const unsigned char *model_data);
+  explicit Impl(DeviceType device_type);
+
   ~Impl();
 
+  MaceStatus Init(const NetDef *net_def,
+                  const std::vector<std::string> &input_nodes,
+                  const std::vector<std::string> &output_nodes,
+                  const unsigned char *model_data);
+
   MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
                  std::map<std::string, MaceTensor> *outputs,
                  RunMetadata *run_metadata);
@@ -113,11 +115,7 @@ class MaceEngine::Impl {
   DISABLE_COPY_AND_ASSIGN(Impl);
 };
 
-MaceEngine::Impl::Impl(const NetDef *net_def,
-                       DeviceType device_type,
-                       const std::vector<std::string> &input_nodes,
-                       const std::vector<std::string> &output_nodes,
-                       const unsigned char *model_data)
+MaceEngine::Impl::Impl(DeviceType device_type)
     : op_registry_(new OperatorRegistry()),
       device_type_(device_type),
       ws_(new Workspace()),
@@ -125,7 +123,13 @@ MaceEngine::Impl::Impl(const NetDef *net_def,
 #ifdef MACE_ENABLE_HEXAGON
       , hexagon_controller_(nullptr)
 #endif
-{
+{}
+
+MaceStatus MaceEngine::Impl::Init(
+    const NetDef *net_def,
+    const std::vector<std::string> &input_nodes,
+    const std::vector<std::string> &output_nodes,
+    const unsigned char *model_data) {
   LOG(INFO) << "MACE version: " << MaceVersion();
   // Set storage path for internal usage
   for (auto input_name : input_nodes) {
@@ -137,7 +141,7 @@ MaceEngine::Impl::Impl(const NetDef *net_def,
                       GetDeviceAllocator(device_type_), DT_FLOAT);
   }
 #ifdef MACE_ENABLE_HEXAGON
-  if (device_type == HEXAGON) {
+  if (device_type_ == HEXAGON) {
     hexagon_controller_.reset(new HexagonControlWrapper());
     MACE_CHECK(hexagon_controller_->Config(), "hexagon config error");
     MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
@@ -153,18 +157,23 @@ MaceEngine::Impl::Impl(const NetDef *net_def,
     }
   } else {
 #endif
-    ws_->LoadModelTensor(*net_def, device_type, model_data);
+    MaceStatus status =
+      ws_->LoadModelTensor(*net_def, device_type_, model_data);
+    if (status != MaceStatus::MACE_SUCCESS) {
+      return status;
+    }
 
-    // Init model
-    auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type,
+  // Init model
+    auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_,
                          NetMode::INIT);
     if (!net->Run()) {
       LOG(FATAL) << "Net init run failed";
     }
-    net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type);
+    net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_);
 #ifdef MACE_ENABLE_HEXAGON
   }
 #endif
+  return MaceStatus::MACE_SUCCESS;
 }
 
 MaceEngine::Impl::~Impl() {
@@ -254,18 +263,18 @@ MaceStatus MaceEngine::Impl::Run(
   return MACE_SUCCESS;
 }
 
-MaceEngine::MaceEngine(const NetDef *net_def,
-                       DeviceType device_type,
-                       const std::vector<std::string> &input_nodes,
-                       const std::vector<std::string> &output_nodes,
-                       const unsigned char *model_data) {
-  impl_ = std::unique_ptr<MaceEngine::Impl>(
-      new MaceEngine::Impl(net_def, device_type, input_nodes, output_nodes,
-                           model_data));
-}
+MaceEngine::MaceEngine(DeviceType device_type):
+    impl_(new MaceEngine::Impl(device_type)) {}
 
 MaceEngine::~MaceEngine() = default;
 
+MaceStatus MaceEngine::Init(const NetDef *net_def,
+                            const std::vector<std::string> &input_nodes,
+                            const std::vector<std::string> &output_nodes,
+                            const unsigned char *model_data) {
+  return impl_->Init(net_def, input_nodes, output_nodes, model_data);
+}
+
 MaceStatus MaceEngine::Run(const std::map<std::string, MaceTensor> &inputs,
                            std::map<std::string, MaceTensor> *outputs,
                            RunMetadata *run_metadata) {
@@ -325,17 +334,18 @@ MaceStatus CreateMaceEngineFromPB(const std::string &model_data_file,
                              const_tensor.data_size() *
                              GetEnumTypeSize(const_tensor.data_type())));
   }
+
+  MaceStatus status;
   const unsigned char *model_data = nullptr;
   model_data = LoadModelData(model_data_file, model_data_size);
 
-  engine->reset(
-      new mace::MaceEngine(&net_def, device_type, input_nodes, output_nodes,
-                           model_data));
+  engine->reset(new mace::MaceEngine(device_type));
+  status = (*engine)->Init(&net_def, input_nodes, output_nodes, model_data);
 
   if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
     UnloadModelData(model_data, model_data_size);
   }
-  return MACE_SUCCESS;
+  return status;
 }
 
 }  // namespace mace
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index 09891e8c4cfaab9afe9757dce75b13793e884e3f..23de679509bc1ea72ac7fab721bf76d8105c53ec 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -18,9 +18,11 @@
 #include <omp.h>
 #endif
 
+#include <errno.h>
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
+#include <string.h>
 #include <algorithm>
 #include <utility>
 #include <vector>
@@ -44,7 +46,7 @@ int GetCPUCount() {
     result = access(path, F_OK);
     if (result != 0) {
       if (errno != ENOENT) {
-        LOG(ERROR) << "Access " << path << " failed, errno: " << errno;
+        LOG(ERROR) << "Access " << path << " failed: " << strerror(errno);
       }
       return cpu_count;
     }
@@ -81,7 +83,7 @@ void SetThreadAffinity(cpu_set_t mask) {
   pid_t pid = syscall(SYS_gettid);
 #endif
   int err = sched_setaffinity(pid, sizeof(mask), &mask);
-  MACE_CHECK(err == 0, "set affinity error: ", errno);
+  MACE_CHECK(err == 0, "set affinity error: ", strerror(errno));
 }
 
 }  // namespace
@@ -101,7 +103,7 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
   for (int i = 0; i < cpu_count; ++i) {
     cpu_max_freq[i] = GetCPUMaxFreq(i);
     if (cpu_max_freq[i] == 0) {
-      LOG(WARNING) << "Cannot get cpu" << i
+      LOG(WARNING) << "Cannot get CPU" << i
                    << "'s max frequency info, maybe it is offline.";
       return MACE_INVALID_ARGS;
     }
@@ -128,13 +130,12 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
 
 void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
                                      const std::vector<int> &cpu_ids) {
+#ifdef MACE_ENABLE_OPENMP
   VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
           << ", CPU core IDs: " << MakeString(cpu_ids);
-
-#ifdef MACE_ENABLE_OPENMP
   omp_set_num_threads(omp_num_threads);
 #else
-  LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed.";
+  LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
 #endif
 
   // compute mask
@@ -147,11 +148,13 @@ void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
 #ifdef MACE_ENABLE_OPENMP
 #pragma omp parallel for
   for (int i = 0; i < omp_num_threads; ++i) {
+    VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num()
+            << "/" << omp_get_num_threads();
     SetThreadAffinity(mask);
   }
 #else
   SetThreadAffinity(mask);
-  LOG(INFO) << "SetThreadAffinity: " << mask.__bits[0];
+  VLOG(1) << "Set affinity without OpenMP: " << mask.__bits[0];
 #endif
 }
 
@@ -163,7 +166,7 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
       omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
     }
 #else
-    LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed.";
+    LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
 #endif
     return MACE_SUCCESS;
   }
@@ -192,7 +195,7 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
 
 MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
                                  CPUAffinityPolicy policy) {
-  VLOG(1) << "Set CPU openmp num_threads_hint: " << num_threads_hint
+  VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint
           << ", affinity policy: " << policy;
   return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, policy);
 }
diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc
index 9b94eb92ec15eb187d76fdf429e14712995baf09..96b083cfcbd0fe9b5fea8cbd2862a69e0700fec5 100644
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -44,18 +44,30 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
 OpenCLAllocator::OpenCLAllocator() {}
 
 OpenCLAllocator::~OpenCLAllocator() {}
-void *OpenCLAllocator::New(size_t nbytes) const {
+MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
+  if (nbytes == 0) {
+    return MaceStatus::MACE_SUCCESS;
+  }
   VLOG(3) << "Allocate OpenCL buffer: " << nbytes;
   cl_int error;
   cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
                                       CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                       nbytes, nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
-  return static_cast<void *>(buffer);
+  if (error != CL_SUCCESS) {
+    LOG(WARNING) << "Allocate OpenCL Buffer with "
+                 << nbytes << " bytes failed because of"
+                 << OpenCLErrorToString(error);
+    *result = nullptr;
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  } else {
+    *result = buffer;
+    return MaceStatus::MACE_SUCCESS;
+  }
 }
 
-void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
-                                const DataType dt) const {
+MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
+                                     const DataType dt,
+                                     void **result) const {
   MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
   VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
           << image_shape[1];
@@ -67,11 +79,17 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
       new cl::Image2D(OpenCLRuntime::Global()->context(),
                       CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
                       image_shape[0], image_shape[1], 0, nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error) << " with image shape: ["
-                               << image_shape[0] << ", " << image_shape[1]
-                               << "]";
-
-  return cl_image;
+  if (error != CL_SUCCESS) {
+    LOG(WARNING) << "Allocate OpenCL image with shape: ["
+                 << image_shape[0] << ", " << image_shape[1]
+                 << "] failed because of"
+                 << OpenCLErrorToString(error);
+    *result = nullptr;
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  } else {
+    *result = cl_image;
+    return MaceStatus::MACE_SUCCESS;
+  }
 }
 
 void OpenCLAllocator::Delete(void *buffer) const {
diff --git a/mace/core/runtime/opencl/opencl_allocator.h b/mace/core/runtime/opencl/opencl_allocator.h
index 0ec50f61062aad81e2e8b0fc010e2a57b457a833..6304add8583f7b2e47c58cd6e6b186ea43b7f092 100644
--- a/mace/core/runtime/opencl/opencl_allocator.h
+++ b/mace/core/runtime/opencl/opencl_allocator.h
@@ -27,15 +27,16 @@ class OpenCLAllocator : public Allocator {
 
   ~OpenCLAllocator() override;
 
-  void *New(size_t nbytes) const override;
+  MaceStatus New(size_t nbytes, void **result) const override;
 
   /*
    * Use Image2D with RGBA (128-bit) format to represent the image.
    *
    * @ shape : [depth, ..., height, width ].
    */
-  void *NewImage(const std::vector<size_t> &image_shape,
-                 const DataType dt) const override;
+  MaceStatus NewImage(const std::vector<size_t> &image_shape,
+                      const DataType dt,
+                      void **result) const override;
 
   void Delete(void *buffer) const override;
 
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 87a6cb3c1c9ff9f3712c50e07c8c6e0d69f5cf61..1404bf023b6d35351cec21c2e5385cae1366bd74 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -208,7 +208,7 @@ class Tensor {
 
   inline void Clear() {
     MACE_CHECK_NOTNULL(buffer_);
-    buffer_->Clear();
+    buffer_->Clear(raw_size());
   }
 
   inline void Reshape(const std::vector<index_t> &shape) {
@@ -216,16 +216,21 @@ class Tensor {
     MACE_CHECK(raw_size() <= buffer_->size());
   }
 
-  inline void Resize(const std::vector<index_t> &shape) {
+  inline MaceStatus Resize(const std::vector<index_t> &shape) {
     shape_ = shape;
     image_shape_.clear();
     if (buffer_ != nullptr) {
       MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
-      if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size())
-        buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE);
+      if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
+        LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to "
+                     << raw_size() + EXTRA_BUFFER_PAD_SIZE;
+        return buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE);
+      }
+      return MaceStatus::MACE_SUCCESS;
     } else {
       MACE_CHECK(is_buffer_owner_);
-      buffer_ = new Buffer(allocator_, raw_size() + EXTRA_BUFFER_PAD_SIZE);
+      buffer_ = new Buffer(allocator_);
+      return buffer_->Allocate(raw_size() + EXTRA_BUFFER_PAD_SIZE);
     }
   }
 
@@ -241,13 +246,14 @@ class Tensor {
     is_buffer_owner_ = false;
   }
 
-  inline void ResizeImage(const std::vector<index_t> &shape,
-                          const std::vector<size_t> &image_shape) {
+  inline MaceStatus ResizeImage(const std::vector<index_t> &shape,
+                                const std::vector<size_t> &image_shape) {
     shape_ = shape;
     image_shape_ = image_shape;
     if (buffer_ == nullptr) {
       MACE_CHECK(is_buffer_owner_);
-      buffer_ = new Image(image_shape, dtype_);
+      buffer_ = new Image();
+      return buffer_->Allocate(image_shape, dtype_);
     } else {
       MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
       Image *image = dynamic_cast<Image *>(buffer_);
@@ -257,24 +263,27 @@ class Tensor {
                  "): current physical image shape: ", image->image_shape()[0],
                  ", ", image->image_shape()[1], " < logical image shape: ",
                  image_shape[0], ", ", image_shape[1]);
+      return MaceStatus::MACE_SUCCESS;
     }
   }
 
-  inline void ResizeLike(const Tensor &other) { ResizeLike(&other); }
+  inline MaceStatus ResizeLike(const Tensor &other) {
+    return ResizeLike(&other);
+  }
 
-  inline void ResizeLike(const Tensor *other) {
+  inline MaceStatus ResizeLike(const Tensor *other) {
     if (other->has_opencl_image()) {
       if (is_buffer_owner_ && buffer_ != nullptr && !has_opencl_image()) {
         delete buffer_;
         buffer_ = nullptr;
       }
-      ResizeImage(other->shape(), other->image_shape_);
+      return ResizeImage(other->shape(), other->image_shape_);
     } else {
       if (is_buffer_owner_ && buffer_ != nullptr && has_opencl_image()) {
         delete buffer_;
         buffer_ = nullptr;
       }
-      Resize(other->shape());
+      return Resize(other->shape());
     }
   }
 
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index bb778f8f3d986a88786697772a9246a30049d25e..545ace635da255256d189b709c5fd9b68545b33b 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -60,9 +60,9 @@ std::vector<std::string> Workspace::Tensors() const {
   return names;
 }
 
-void Workspace::LoadModelTensor(const NetDef &net_def,
-                                DeviceType type,
-                                const unsigned char *model_data) {
+MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
+                                      DeviceType type,
+                                      const unsigned char *model_data) {
   MACE_LATENCY_LOGGER(1, "Load model tensors");
   index_t model_data_size = 0;
   for (auto &const_tensor : net_def.tensors()) {
@@ -82,7 +82,11 @@ void Workspace::LoadModelTensor(const NetDef &net_def,
                      model_data_size));
     } else {
       tensor_buffer_ = std::unique_ptr<Buffer>(
-          new Buffer(GetDeviceAllocator(type), model_data_size));
+          new Buffer(GetDeviceAllocator(type)));
+      MaceStatus status = tensor_buffer_->Allocate(model_data_size);
+      if (status != MaceStatus::MACE_SUCCESS) {
+        return status;
+      }
       tensor_buffer_->Map(nullptr);
       tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
                            0, model_data_size);
@@ -112,14 +116,16 @@ void Workspace::LoadModelTensor(const NetDef &net_def,
   }
 
   if (type == DeviceType::CPU || type == DeviceType::GPU) {
-    CreateOutputTensorBuffer(net_def, type);
+    MaceStatus status = CreateOutputTensorBuffer(net_def, type);
+    if (status != MaceStatus::MACE_SUCCESS) return status;
   }
+  return MaceStatus::MACE_SUCCESS;
 }
 
-void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
-                                         DeviceType device_type) {
+MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
+                                               DeviceType device_type) {
   if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) {
-    return;
+    return MaceStatus::MACE_SUCCESS;
   }
 
   DataType dtype = DataType::DT_INVALID;
@@ -149,14 +155,24 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
       // TODO(liuqi): refactor based on PB
       if (mem_block.mem_id() >= 20000) {
         std::unique_ptr<BufferBase> image_buf(
-            new Image({mem_block.x(), mem_block.y()}, dtype));
+            new Image());
+        MaceStatus status = image_buf->Allocate(
+            {mem_block.x(), mem_block.y()}, dtype);
+        if (status != MaceStatus::MACE_SUCCESS) {
+          return status;
+        }
         preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                           std::move(image_buf));
       }
     } else {
       if (mem_block.mem_id() < 20000) {
         std::unique_ptr<BufferBase> tensor_buf(
-            new Buffer(GetDeviceAllocator(device_type), mem_block.x()));
+            new Buffer(GetDeviceAllocator(device_type)));
+        MaceStatus status = tensor_buf->Allocate(
+            mem_block.x() * GetEnumTypeSize(dtype) + EXTRA_BUFFER_PAD_SIZE);
+        if (status != MaceStatus::MACE_SUCCESS) {
+          return status;
+        }
         preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                           std::move(tensor_buf));
       }
@@ -193,6 +209,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
       }
     }
   }
+  return MaceStatus::MACE_SUCCESS;
 }
 
 ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
diff --git a/mace/core/workspace.h b/mace/core/workspace.h
index 35bdc790c73d74e6fe37bec5368ce61798e9cbac..7399562dd63ad86b7b5b835c3b1eb2f9cda36d71 100644
--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -47,14 +47,15 @@ class Workspace {
 
   std::vector<std::string> Tensors() const;
 
-  void LoadModelTensor(const NetDef &net_def,
-                       DeviceType type,
-                       const unsigned char *model_data);
+  MaceStatus LoadModelTensor(const NetDef &net_def,
+                             DeviceType type,
+                             const unsigned char *model_data);
 
   ScratchBuffer *GetScratchBuffer(DeviceType device_type);
 
  private:
-  void CreateOutputTensorBuffer(const NetDef &net_def, DeviceType device_type);
+  MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
+                                      DeviceType device_type);
 
   TensorMap tensor_map_;
 
diff --git a/mace/kernels/arm/conv_2d_neon.h b/mace/kernels/arm/conv_2d_neon.h
index 3d3c907e97221eb4970a47f5caba9b69b0e13070..5d2d5f9adb96571ee0a7def3a527cdb23c192d5d 100644
--- a/mace/kernels/arm/conv_2d_neon.h
+++ b/mace/kernels/arm/conv_2d_neon.h
@@ -31,68 +31,38 @@ extern void Conv2dNeonK1x1S1(const float *input,
 
 extern void Conv2dNeonK3x3S1(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 
 extern void Conv2dNeonK3x3S2(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 
 extern void Conv2dNeonK5x5S1(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 
 extern void Conv2dNeonK7x7S1(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 
 extern void Conv2dNeonK7x7S2(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 
 extern void Conv2dNeonK7x7S3(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 
 }  // namespace kernels
diff --git a/mace/kernels/arm/conv_2d_neon_3x3.cc b/mace/kernels/arm/conv_2d_neon_3x3.cc
index 0e4ac0eb8c1dcc4ac0c3686d45d80c1f7f3ea266..58b28ddc48a5aa5b880c80e6bbdde8ca32f46e38 100644
--- a/mace/kernels/arm/conv_2d_neon_3x3.cc
+++ b/mace/kernels/arm/conv_2d_neon_3x3.cc
@@ -24,22 +24,22 @@ namespace kernels {
 // Ho = 2, Wo = 4, Co = 2
 void Conv2dNeonK3x3S1(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 2) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 2) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
       if (m + 1 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
@@ -522,23 +522,22 @@ void Conv2dNeonK3x3S1(const float *input,
 
 void Conv2dNeonK3x3S2(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; ++m) {
-      for (index_t c = 0; c < in_channels; ++c) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
+      for (index_t c = 0; c < in_shape[1]; ++c) {
+        const index_t in_channels = in_shape[1];
+        const index_t in_width = in_shape[3];
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
         const float *in_base = input + b * in_batch_size + c * in_image_size;
         const float
           *filter_ptr = filter + m * in_channels * 9 + c * 9;
diff --git a/mace/kernels/arm/conv_2d_neon_5x5.cc b/mace/kernels/arm/conv_2d_neon_5x5.cc
index f4fe7ce7ccbaa3ca69de88da6234a17b66097470..3d77d8f6b5535a386dd2d6ba1a18367e1c189bf1 100644
--- a/mace/kernels/arm/conv_2d_neon_5x5.cc
+++ b/mace/kernels/arm/conv_2d_neon_5x5.cc
@@ -103,22 +103,22 @@ inline void Conv2dCPUK5x5Calc(const float *in_ptr_base,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK5x5S1(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
diff --git a/mace/kernels/arm/conv_2d_neon_7x7.cc b/mace/kernels/arm/conv_2d_neon_7x7.cc
index 057b93138cebb226fab6008798ccc273a29e574d..4432f2a05b848fdc9978b5a10fed7985ff3d4cff 100644
--- a/mace/kernels/arm/conv_2d_neon_7x7.cc
+++ b/mace/kernels/arm/conv_2d_neon_7x7.cc
@@ -180,22 +180,22 @@ inline void Conv2dCPUK7x7Calc(const float *in_ptr_base,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK7x7S1(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
@@ -336,22 +336,22 @@ void Conv2dNeonK7x7S1(const float *input,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK7x7S2(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
@@ -502,22 +502,22 @@ void Conv2dNeonK7x7S2(const float *input,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK7x7S3(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
diff --git a/mace/kernels/arm/depthwise_conv2d_neon.h b/mace/kernels/arm/depthwise_conv2d_neon.h
index 130cd360c434ac7283ad83f93587d4ce9c02e3f9..119867bfe3efdc3791aa267566b59bafc9edf8a4 100644
--- a/mace/kernels/arm/depthwise_conv2d_neon.h
+++ b/mace/kernels/arm/depthwise_conv2d_neon.h
@@ -22,15 +22,9 @@ namespace kernels {
 
 void DepthwiseConv2dNeonK3x3S1(const float *input,
                                const float *filter,
-                               const index_t batch,
-                               const index_t in_height,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t out_channels,
-                               const int pad_top,
-                               const int pad_left,
+                               const index_t *in_shape,
+                               const index_t *out_shape,
+                               const int *pad_hw,
                                const index_t valid_h_start,
                                const index_t valid_h_stop,
                                const index_t valid_w_start,
@@ -39,15 +33,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
 
 void DepthwiseConv2dNeonK3x3S2(const float *input,
                           const float *filter,
-                          const index_t batch,
-                          const index_t in_height,
-                          const index_t in_width,
-                          const index_t in_channels,
-                          const index_t out_height,
-                          const index_t out_width,
-                          const index_t out_channels,
-                          const int pad_top,
-                          const int pad_left,
+                          const index_t *in_shape,
+                          const index_t *out_shape,
+                          const int *pad_hw,
                           const index_t valid_h_start,
                           const index_t valid_h_stop,
                           const index_t valid_w_start,
diff --git a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
index fb36bdaded33d5217f1ccb9ae1d9427204433cc5..443e57b7ec5a3d7bfe473ea42e79252eab7c4305 100644
--- a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
+++ b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
@@ -52,15 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
 // Ho = 2, Wo = 4, Co = 1
 void DepthwiseConv2dNeonK3x3S1(const float *input,
                                const float *filter,
-                               const index_t batch,
-                               const index_t in_height,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t out_channels,
-                               const int pad_top,
-                               const int pad_left,
+                               const index_t* in_shape,
+                               const index_t* out_shape,
+                               const int* pad_hw,
                                const index_t valid_h_start,
                                const index_t valid_h_stop,
                                const index_t valid_w_start,
@@ -70,25 +64,30 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
   MACE_UNUSED(valid_w_start);
   MACE_UNUSED(valid_w_stop);
 #endif
-  const index_t multiplier = out_channels / in_channels;
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t multiplier = out_shape[1] / in_shape[1];
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; ++m) {
+  for (index_t b = 0; b < in_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
       index_t c = m / multiplier;
       index_t multi_index = m % multiplier;
       const float *in_base = input + b * in_batch_size + c * in_image_size;
-      const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
+      const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9;
       float *out_base = output + b * out_batch_size + m * out_image_size;
       index_t h, w;
+      const index_t pad_top = pad_hw[0];
+      const index_t pad_left = pad_hw[1];
+      const index_t out_width = out_shape[3];
+      const index_t in_height = in_shape[2];
+      const index_t in_width = in_shape[3];
 
       // top
       for (h = 0; h < valid_h_start; ++h) {
-        for (w = 0; w < out_width; ++w) {
+        for (w = 0; w < out_shape[3]; ++w) {
           DepthwiseConv2dPixel(in_base,
                                filter_ptr,
                                h,
@@ -256,7 +255,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
       }  // h
 #else
       for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
-        for (index_t iw = 0; iw < out_width; ++iw) {
+        for (index_t iw = 0; iw < out_shape[3]; ++iw) {
           DepthwiseConv2dPixel(in_base,
                                filter_ptr,
                                ih,
@@ -274,8 +273,8 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
 #endif
 
       // bottom
-      for (; h < out_height; ++h) {
-        for (w = 0; w < out_width; ++w) {
+      for (; h < out_shape[2]; ++h) {
+        for (w = 0; w < out_shape[3]; ++w) {
           DepthwiseConv2dPixel(in_base,
                                filter_ptr,
                                h,
@@ -296,15 +295,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
 
 void DepthwiseConv2dNeonK3x3S2(const float *input,
                                const float *filter,
-                               const index_t batch,
-                               const index_t in_height,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t out_channels,
-                               const int pad_top,
-                               const int pad_left,
+                               const index_t* in_shape,
+                               const index_t* out_shape,
+                               const int* pad_hw,
                                const index_t valid_h_start,
                                const index_t valid_h_stop,
                                const index_t valid_w_start,
@@ -314,22 +307,26 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
   MACE_UNUSED(valid_w_start);
   MACE_UNUSED(valid_w_stop);
 #endif
-  const index_t multiplier = out_channels / in_channels;
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t multiplier = out_shape[1] / in_shape[1];
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; ++m) {
+  for (index_t b = 0; b < in_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
       index_t c = m / multiplier;
       index_t multi_index = m % multiplier;
       const float *in_base = input + b * in_batch_size + c * in_image_size;
-      const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
+      const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9;
       float *out_base = output + b * out_batch_size + m * out_image_size;
       index_t h, w;
-
+      const index_t pad_top = pad_hw[0];
+      const index_t pad_left = pad_hw[1];
+      const index_t out_width = out_shape[3];
+      const index_t in_height = in_shape[2];
+      const index_t in_width = in_shape[3];
       // top
       for (h = 0; h < valid_h_start; ++h) {
         for (w = 0; w < out_width; ++w) {
@@ -472,8 +469,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
 #endif
 
       // bottom
-      for (; h < out_height; ++h) {
-        for (w = 0; w < out_width; ++w) {
+      for (; h < out_shape[2]; ++h) {
+        for (w = 0; w < out_shape[3]; ++w) {
           DepthwiseConv2dPixel(in_base,
                                filter_ptr,
                                h,
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index dfe5540d16e60bf250685e6a725cede177a2ff97..eb374960cab4eb5d14a29316c4d78ab6fb2c324b 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -84,49 +84,45 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
 
   void Conv2dGeneral(const float *input,
                      const float *filter,
-                     const index_t batch,
-                     const index_t in_height,
-                     const index_t in_width,
-                     const index_t in_channels,
-                     const index_t out_height,
-                     const index_t out_width,
-                     const index_t out_channels,
-                     const int filter_height,
-                     const int filter_width,
-                     const int stride_h,
-                     const int stride_w,
-                     const int dilation_h,
-                     const int dilation_w,
+                     const index_t *in_shape,
+                     const index_t *out_shape,
+                     const index_t *filter_shape,
+                     const int *stride_hw,
+                     const int *dilation_hw,
                      float *output) {
-    const index_t in_image_size = in_height * in_width;
-    const index_t out_image_size = out_height * out_width;
-    const index_t in_batch_size = in_channels * in_image_size;
-    const index_t out_batch_size = out_channels * out_image_size;
-    const index_t filter_size = filter_height * filter_width;
+    const index_t in_image_size = in_shape[2] * in_shape[3];
+    const index_t out_image_size = out_shape[2] * out_shape[3];
+    const index_t in_batch_size = filter_shape[1] * in_image_size;
+    const index_t out_batch_size = filter_shape[0] * out_image_size;
+    const index_t filter_size = filter_shape[2] * filter_shape[3];
 
 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t m = 0; m < out_channels; m += 4) {
+    for (index_t b = 0; b < in_shape[0]; b++) {
+      for (index_t m = 0; m < filter_shape[0]; m += 4) {
+        const index_t in_width = in_shape[3];
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
+        const index_t out_channels = filter_shape[0];
+        const index_t in_channels = filter_shape[1];
+
+        const int stride_h = stride_hw[0];
+        const int stride_w = stride_hw[1];
+        const int dilation_h = dilation_hw[0];
+        const int dilation_w = dilation_hw[1];
         if (m + 3 < out_channels) {
           float *out_ptr0_base =
               output + b * out_batch_size + m * out_image_size;
-          float *out_ptr1_base =
-              output + b * out_batch_size + (m + 1) * out_image_size;
-          float *out_ptr2_base =
-              output + b * out_batch_size + (m + 2) * out_image_size;
-          float *out_ptr3_base =
-              output + b * out_batch_size + (m + 3) * out_image_size;
+          float *out_ptr1_base = out_ptr0_base + out_image_size;
+          float *out_ptr2_base = out_ptr1_base + out_image_size;
+          float *out_ptr3_base = out_ptr2_base + out_image_size;
           for (index_t c = 0; c < in_channels; ++c) {
             const float *in_ptr_base =
                 input + b * in_batch_size + c * in_image_size;
             const float *filter_ptr0 =
                 filter + m * in_channels * filter_size + c * filter_size;
-            const float *filter_ptr1 =
-                filter + (m + 1) * in_channels * filter_size + c * filter_size;
-            const float *filter_ptr2 =
-                filter + (m + 2) * in_channels * filter_size + c * filter_size;
-            const float *filter_ptr3 =
-                filter + (m + 3) * in_channels * filter_size + c * filter_size;
+            const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size;
+            const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size;
+            const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size;
             for (index_t h = 0; h < out_height; ++h) {
               for (index_t w = 0; w + 3 < out_width; w += 4) {
                  // input offset
@@ -144,8 +140,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                    vo3[ow] = out_ptr3_base[out_offset + ow];
                 }
                 // calc by row
-                for (index_t kh = 0; kh < filter_height; ++kh) {
-                  for (index_t kw = 0; kw < filter_width; ++kw) {
+                for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
+                  for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
                     // outch 0
                     vo0[0] += in_ptr_base[in_offset
                         + kw * dilation_w] * filter_ptr0[kw];
@@ -185,10 +181,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                   }  // kw
 
                   in_offset += dilation_h * in_width;
-                  filter_ptr0 += filter_width;
-                  filter_ptr1 += filter_width;
-                  filter_ptr2 += filter_width;
-                  filter_ptr3 += filter_width;
+                  filter_ptr0 += filter_shape[3];
+                  filter_ptr1 += filter_shape[3];
+                  filter_ptr2 += filter_shape[3];
+                  filter_ptr3 += filter_shape[3];
                 }  // kh
 
                 for (index_t ow = 0; ow < 4; ++ow) {
@@ -230,8 +226,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                   }
 
                   // calc by row
-                  for (index_t kh = 0; kh < filter_height; ++kh) {
-                    for (index_t kw = 0; kw < filter_width; ++kw) {
+                  for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
+                    for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
                       // outch 0
                       vo0[0] += in_ptr_base[in_offset
                           + kw * dilation_w] * filter_ptr0[kw];
@@ -244,7 +240,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                     }  // kw
 
                     in_offset += dilation_h * in_width;
-                    filter_ptr0 += filter_width;
+                    filter_ptr0 += filter_shape[3];
                   }  // kh
 
                   for (index_t ow = 0; ow < 4; ++ow) {
@@ -301,7 +297,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                          output_shape.data());
     }
     output->Resize(output_shape);
-    output->Clear();
 
     index_t batch = output->dim(0);
     index_t channels = output->dim(1);
@@ -419,7 +414,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
       if (extra_input_width != padded_input_width) {
         pad_right += (extra_input_width - padded_input_width);
       }
-    } else {
+    } else if (!use_neon_1x1_s1) {
       extra_output_height = height;
       extra_input_height =
           std::max(padded_input_height, (extra_output_height - 1) * stride_h
@@ -478,6 +473,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
       transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
     Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
     Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
+    const index_t extra_input_shape[4] =
+        {batch, input_channels, extra_input_height, extra_input_width};
+    const index_t extra_output_shape[4] =
+        {batch, channels, extra_output_height, extra_output_width};
 
     // decide which convolution function to call
     if (use_winograd) {
@@ -512,6 +511,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
 
       float *transformed_input_data = transformed_input.mutable_data<float>();
       float *transformed_output_data = transformed_output.mutable_data<float>();
+
       conv_func = [=](const float *pad_input, float *pad_output) {
         WinoGradConv3x3s1(pad_input,
                           transformed_filter_ptr,
@@ -529,26 +529,16 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dNeonK3x3S1(pad_input,
                          filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                          pad_output);
       };
     } else if (use_neon_3x3_s2) {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dNeonK3x3S2(pad_input,
                          filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                          pad_output);
       };
     } else if (use_neon_1x1_s1) {
@@ -566,71 +556,43 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dNeonK5x5S1(pad_input,
                          filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                          pad_output);
       };
     } else if (use_neon_7x7_s1) {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dNeonK7x7S1(pad_input,
                          filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                          pad_output);
       };
     } else if (use_neon_7x7_s2) {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dNeonK7x7S2(pad_input,
                          filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                          pad_output);
       };
     } else if (use_neon_7x7_s3) {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dNeonK7x7S3(pad_input,
                          filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                          pad_output);
       };
     } else {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dGeneral(pad_input,
                       filter_data,
-                      batch,
-                      extra_input_height,
-                      extra_input_width,
-                      input_channels,
-                      extra_output_height,
-                      extra_output_width,
-                      channels,
-                      filter_h,
-                      filter_w,
-                      stride_h,
-                      stride_w,
-                      dilation_h,
-                      dilation_w,
+                      extra_input_shape,
+                      extra_output_shape,
+                      filter_shape.data(),
+                      strides_,
+                      dilations_,
                       pad_output);
       };
     }
@@ -639,7 +601,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     const Tensor *pad_input_ptr = input;
     if (extra_input_height != input_height
       || extra_input_width != input_width) {
-      padded_input.Clear();
       ConstructNCHWInputWithSpecificPadding(input,
                                             pad_top,
                                             pad_bottom,
@@ -649,13 +610,17 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
       pad_input_ptr = &padded_input;
     }
 
+    // TODO(libin): don't need clear after bias is integrated in each conv
     Tensor *pad_output_ptr = output;
     if (extra_output_height != height || extra_output_width != width) {
       padded_output.Reshape({batch, channels, extra_output_height,
                             extra_output_width});
       padded_output.Clear();
       pad_output_ptr = &padded_output;
+    } else if (!use_neon_1x1_s1) {
+      output->Clear();
     }
+
     const float *pad_input_data = pad_input_ptr->data<float>();
     float *pad_output_data = pad_output_ptr->mutable_data<float>();
 
diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc
index b0f5229f94de2be770f68eb42d538170da8fa7ca..07c72cb35eece69e9b2cefae8b82841dac397524 100644
--- a/mace/kernels/conv_pool_2d_util.cc
+++ b/mace/kernels/conv_pool_2d_util.cc
@@ -377,6 +377,7 @@ void ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
   std::vector<index_t> output_shape(
     {batch, channels, height + pad_height, width + pad_width});
   output_tensor->Resize(output_shape);
+  output_tensor->Clear();
   Tensor::MappingGuard padded_output_mapper(output_tensor);
   float *output_data = output_tensor->mutable_data<float>();
 
diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h
index 14d78a45fee4b6bd62914b46e72a8e804fa83d0f..7c20adddaa91f7fecff80008afcaa9f2680e323f 100644
--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -41,48 +41,40 @@ template<typename T>
 void Deconv2dNCHW(const T *input,
                   const T *filter,
                   const T *bias,
-                  const index_t batch,
-                  const index_t in_height,
-                  const index_t in_width,
-                  const index_t in_channels,
-                  const index_t out_height,
-                  const index_t out_width,
-                  const index_t out_channels,
-                  const index_t filter_height,
-                  const index_t filter_width,
-                  const index_t stride_h,
-                  const index_t stride_w,
-                  const int padding_top,
-                  const int padding_left,
+                  const index_t *in_shape,
+                  const index_t *out_shape,
+                  const index_t *kernel_hw,
+                  const int *strides,
+                  const int *padding,
                   float *output) {
 #pragma omp parallel for collapse(4)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t oc = 0; oc < out_channels; ++oc) {
-      for (index_t oh = 0; oh < out_height; ++oh) {
-        for (index_t ow = 0; ow < out_width; ++ow) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t oc = 0; oc < out_shape[1]; ++oc) {
+      for (index_t oh = 0; oh < out_shape[2]; ++oh) {
+        for (index_t ow = 0; ow < out_shape[3]; ++ow) {
           index_t filter_start_y, filter_start_x;
-          index_t start_x = std::max<int>(0, ow + stride_w -1 - padding_left);
-          index_t start_y = std::max<int>(0, oh + stride_h -1 - padding_top);
-          start_x /= stride_w;
-          start_y /= stride_h;
-          filter_start_x = padding_left + stride_w * start_x - ow;
-          filter_start_y = padding_top + stride_h * start_y - oh;
-          filter_start_x = filter_width - 1 - filter_start_x;
-          filter_start_y = filter_height - 1 - filter_start_y;
+          index_t start_x = std::max<int>(0, ow + strides[1] -1 - padding[1]);
+          index_t start_y = std::max<int>(0, oh + strides[0] -1 - padding[0]);
+          start_x /= strides[1];
+          start_y /= strides[0];
+          filter_start_x = padding[1] + strides[1] * start_x - ow;
+          filter_start_y = padding[0] + strides[0] * start_y - oh;
+          filter_start_x = kernel_hw[1] - 1 - filter_start_x;
+          filter_start_y = kernel_hw[0] - 1 - filter_start_y;
           T out_value = 0;
           index_t out_pos =
-              ((b * out_channels + oc) * out_height + oh) * out_width + ow;
-          for (index_t ic = 0; ic < in_channels; ++ic) {
+              ((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow;
+          for (index_t ic = 0; ic < in_shape[1]; ++ic) {
             for (index_t f_y = filter_start_y, ih = start_y;
-                 f_y >= 0 && ih < in_height; f_y -= stride_h, ++ih) {
+                 f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) {
               for (index_t f_x = filter_start_x, iw = start_x;
-                  f_x >= 0 && iw < in_width; f_x -= stride_w, ++iw) {
+                  f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) {
                   index_t weight_pos =
-                      ((oc * in_channels + ic) * filter_height + f_y)
-                          * filter_width + f_x;
+                      ((oc * in_shape[1] + ic) * kernel_hw[0] + f_y)
+                          * kernel_hw[1] + f_x;
                   index_t in_pos =
-                      ((b * in_channels + ic) * in_height + ih)
-                          * in_width + iw;
+                      ((b * in_shape[1] + ic) * in_shape[2] + ih)
+                          * in_shape[3] + iw;
                   out_value += input[in_pos] * filter[weight_pos];
               }
             }
@@ -269,26 +261,17 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
                            paddings_.data(), true);
       output->Resize(output_shape_);
     }
-    index_t batch = output->dim(0);
-    index_t channels = output->dim(1);
-    index_t height = output->dim(2);
-    index_t width = output->dim(3);
-
-    index_t input_batch = input->dim(0);
-    index_t input_channels = input->dim(1);
-    index_t input_height = input->dim(2);
-    index_t input_width = input->dim(3);
-
     index_t kernel_h = filter->dim(2);
     index_t kernel_w = filter->dim(3);
-    MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
-    MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
-               input_channels);
-
-    index_t stride_h = strides_[0];
-    index_t stride_w = strides_[1];
-
-    MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
+    const index_t *in_shape = input->shape().data();
+    const index_t *out_shape = output->shape().data();
+    const index_t kernel_hw[2] = {kernel_h, kernel_w};
+
+    MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ",
+               output_shape[1]);
+    MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ",
+               in_shape[1]);
+    MACE_CHECK(in_shape[0] == out_shape[0], "Input/Output batch size mismatch");
     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard filter_mapper(filter);
     Tensor::MappingGuard bias_mapper(bias);
@@ -297,17 +280,23 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
     auto filter_data = filter->data<T>();
     auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
     auto output_data = output->mutable_data<T>();
-    int padding_top = (paddings_[0] + 1) >> 1;
-    int padding_left = (paddings_[1] + 1) >> 1;
-
-    deconv::Deconv2dNCHW(input_data, filter_data, bias_data,
-                         batch, input_height, input_width, input_channels,
-                         height, width, channels,
-                         kernel_h, kernel_w,
-                         stride_h, stride_w, padding_top, padding_left,
+    int padding[2];
+    padding[0] = (paddings_[0] + 1) >> 1;
+    padding[1] = (paddings_[1] + 1) >> 1;
+    deconv::Deconv2dNCHW(input_data,
+                         filter_data,
+                         bias_data,
+                         in_shape,
+                         out_shape,
+                         kernel_hw,
+                         strides_,
+                         padding,
                          output_data);
 
-    DoActivation(output_data, output_data, output->size(), activation_,
+    DoActivation(output_data,
+                 output_data,
+                 output->size(),
+                 activation_,
                  relux_max_limit_);
   }
 };
diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h
index 733591a5ef8fd906945736848ef3bed6f75c1b10..2afd905b97b77180a61de92f5caea403c4e5ea79 100644
--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -34,10 +34,10 @@ struct DepthToSpaceOpFunctor {
     : block_size_(block_size), d2s_(d2s) {}
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
     MACE_UNUSED(future);
-    const int batch_size = input->dim(0);
-    const int input_depth = input->dim(1);
-    const int input_height = input->dim(2);
-    const int input_width = input->dim(3);
+    const index_t batch_size = input->dim(0);
+    const index_t input_depth = input->dim(1);
+    const index_t input_height = input->dim(2);
+    const index_t input_width = input->dim(3);
 
     index_t output_depth, output_width, output_height;
 
@@ -62,11 +62,11 @@ struct DepthToSpaceOpFunctor {
 
     if (d2s_) {
 #pragma omp parallel for
-      for (int b = 0; b < batch_size; ++b) {
-        for (int d = 0; d < output_depth; ++d) {
-          for (int h = 0; h < output_height; ++h) {
-            const int in_h = h / block_size_;
-            const int offset_h = (h % block_size_);
+      for (index_t b = 0; b < batch_size; ++b) {
+        for (index_t d = 0; d < output_depth; ++d) {
+          for (index_t h = 0; h < output_height; ++h) {
+            const index_t in_h = h / block_size_;
+            const index_t offset_h = (h % block_size_);
             for (int w = 0; w < output_width; ++w) {
               const index_t in_w = w / block_size_;
               const index_t offset_w = w % block_size_;
@@ -86,18 +86,18 @@ struct DepthToSpaceOpFunctor {
       }
     } else {
 #pragma omp parallel for
-      for (int b = 0; b < batch_size; ++b) {
-        for (int d = 0; d < input_depth; ++d) {
-          for (int h = 0; h < input_height; ++h) {
-            const int out_h = h / block_size_;
-            const int offset_h = (h % block_size_);
-            for (int w = 0; w < input_width; ++w) {
-              const int out_w = w / block_size_;
-              const int offset_w = (w % block_size_);
-              const int offset_d =
+      for (index_t b = 0; b < batch_size; ++b) {
+        for (index_t d = 0; d < input_depth; ++d) {
+          for (index_t h = 0; h < input_height; ++h) {
+            const index_t out_h = h / block_size_;
+            const index_t offset_h = (h % block_size_);
+            for (index_t w = 0; w < input_width; ++w) {
+              const index_t out_w = w / block_size_;
+              const index_t offset_w = (w % block_size_);
+              const index_t offset_d =
                 (offset_h * block_size_ + offset_w) * input_depth;
 
-              const int out_d = d + offset_d;
+              const index_t out_d = d + offset_d;
               const index_t o_index =
                 ((b * output_depth + out_d) * output_height + out_h)
                   * output_width + out_w;
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index ce3c1e48551a66915fcf1888dd21d5ced19edeea..a276b504e0c83cd0606b14965936af796be3f06a 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -78,28 +78,27 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
 
   void DepthwiseConv2dGeneral(const float *input,
                               const float *filter,
-                              const index_t batch,
-                              const index_t in_height,
-                              const index_t in_width,
-                              const index_t in_channels,
-                              const index_t out_height,
-                              const index_t out_width,
-                              const index_t out_channels,
-                              const int filter_height,
-                              const int filter_width,
-                              const int stride_h,
-                              const int stride_w,
-                              const int dilation_h,
-                              const int dilation_w,
-                              const int pad_top,
-                              const int pad_left,
+                              const index_t *in_shape,
+                              const index_t *out_shape,
+                              const index_t *filter_shape,
+                              const int *stride_hw,
+                              const int *dilation_hw,
+                              const int *pad_hw,
                               float *output) {
-    const index_t multiplier = out_channels / in_channels;
+    const index_t multiplier = filter_shape[0] / filter_shape[1];
 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t m = 0; m < out_channels; ++m) {
-        for (index_t h = 0; h < out_height; ++h) {
-          for (index_t w = 0; w < out_width; ++w) {
+    for (index_t b = 0; b < in_shape[0]; ++b) {
+      for (index_t m = 0; m < filter_shape[0]; ++m) {
+        for (index_t h = 0; h < out_shape[2]; ++h) {
+          for (index_t w = 0; w < out_shape[3]; ++w) {
+            const index_t out_channels = filter_shape[0];
+            const index_t in_channels = filter_shape[1];
+            const index_t filter_height = filter_shape[2];
+            const index_t filter_width = filter_shape[3];
+            const index_t in_height = in_shape[2];
+            const index_t in_width = in_shape[3];
+            const index_t out_height = out_shape[2];
+            const index_t out_width = out_shape[3];
             index_t out_offset =
               ((b * out_channels + m) * out_height + h) * out_width + w;
             index_t c = m / multiplier;
@@ -107,8 +106,8 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
             float sum = 0;
             for (index_t kh = 0; kh < filter_height; ++kh) {
               for (index_t kw = 0; kw < filter_width; ++kw) {
-                index_t ih = h * stride_h + kh * dilation_h - pad_top;
-                index_t iw = w * stride_w + kw * dilation_w - pad_left;
+                index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0];
+                index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1];
                 if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
                   index_t in_offset =
                     ((b * in_channels + c) * in_height + ih) * in_width + iw;
@@ -214,20 +213,18 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
     auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
     auto output_data = output->mutable_data<float>();
 
+    const int pad_hw[2] = {pad_top, pad_left};
+    const index_t input_shape[4] =
+        {batch, input_channels, input_height, input_width};
+
     if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
       && dilation_h == 1 && dilation_w == 1) {
       conv_func = [=](const float *input, float *output) {
         DepthwiseConv2dNeonK3x3S1(input,
                                   filter_data,
-                                  batch,
-                                  input_height,
-                                  input_width,
-                                  input_channels,
-                                  height,
-                                  width,
-                                  channels,
-                                  pad_top,
-                                  pad_left,
+                                  input_shape,
+                                  output_shape.data(),
+                                  pad_hw,
                                   valid_h_start,
                                   valid_h_stop,
                                   valid_w_start,
@@ -239,15 +236,9 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
       conv_func = [=](const float *input, float *output) {
         DepthwiseConv2dNeonK3x3S2(input,
                                   filter_data,
-                                  batch,
-                                  input_height,
-                                  input_width,
-                                  input_channels,
-                                  height,
-                                  width,
-                                  channels,
-                                  pad_top,
-                                  pad_left,
+                                  input_shape,
+                                  output_shape.data(),
+                                  pad_hw,
                                   valid_h_start,
                                   valid_h_stop,
                                   valid_w_start,
@@ -258,21 +249,12 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
       conv_func = [=](const float *input, float *output) {
         DepthwiseConv2dGeneral(input,
                                filter_data,
-                               batch,
-                               input_height,
-                               input_width,
-                               input_channels,
-                               height,
-                               width,
-                               channels,
-                               filter_h,
-                               filter_w,
-                               stride_h,
-                               stride_w,
-                               dilation_h,
-                               dilation_w,
-                               pad_top,
-                               pad_left,
+                               input_shape,
+                               output_shape.data(),
+                               filter_shape.data(),
+                               strides_,
+                               dilations_,
+                               pad_hw,
                                output);
       };
     }
diff --git a/mace/kernels/gemm.cc b/mace/kernels/gemm.cc
index 0fae44de1fa7c1c77195cfd6c93140c6e60c1d05..178e0720a13555d00387ccc57a87f065cd746b42 100644
--- a/mace/kernels/gemm.cc
+++ b/mace/kernels/gemm.cc
@@ -140,8 +140,8 @@ inline void GemmTile(const float *A,
 #endif
 
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
-  for (h = 0; h + 7 < height; h += 8) {
-    for (k = 0; k + 7 < K; k += 8) {
+  for (h = 0; h < height - 7; h += 8) {
+    for (k = 0; k < K - 7; k += 8) {
       const float *a_ptr = A + (h * stride_k + k);
 #ifdef __clang__
       int nw = width >> 2;
@@ -185,156 +185,150 @@ inline void GemmTile(const float *A,
         float *c_ptr7 = C + (h + 7) * stride_w;
 
         asm volatile(
-        "0:                                 \n"
-
-          "prfm   pldl1keep, [%1, #128]       \n"
-          "ld1    {v24.4s}, [%1]              \n"
-
-          // load b: 0-7
-          "prfm   pldl1keep, [%9, #128]       \n"
-          "ld1    {v16.4s}, [%9], #16         \n"
-
-          "prfm   pldl1keep, [%10, #128]      \n"
-          "ld1    {v17.4s}, [%10], #16        \n"
-
-          "prfm   pldl1keep, [%11, #128]      \n"
-          "ld1    {v18.4s}, [%11], #16        \n"
-
-          "prfm   pldl1keep, [%12, #128]      \n"
-          "ld1    {v19.4s}, [%12], #16        \n"
-
-          "prfm   pldl1keep, [%2, #128]       \n"
-          "ld1    {v25.4s}, [%2]              \n"
-
-          "prfm   pldl1keep, [%13, #128]      \n"
-          "ld1    {v20.4s}, [%13], #16        \n"
-
-          "prfm   pldl1keep, [%14, #128]      \n"
-          "ld1    {v21.4s}, [%14], #16        \n"
-
-          "prfm   pldl1keep, [%15, #128]      \n"
-          "ld1    {v22.4s}, [%15], #16        \n"
-
-          "prfm   pldl1keep, [%16, #128]      \n"
-          "ld1    {v23.4s}, [%16], #16        \n"
-
-          "prfm   pldl1keep, [%3, #128]       \n"
-          "ld1    {v26.4s}, [%3]              \n"
-
-          "fmla   v24.4s, v16.4s, %34.s[0]    \n"
-          "fmla   v24.4s, v17.4s, %34.s[1]    \n"
-          "fmla   v24.4s, v18.4s, %34.s[2]    \n"
-          "fmla   v24.4s, v19.4s, %34.s[3]    \n"
-
-          "fmla   v24.4s, v20.4s, %35.s[0]    \n"
-          "fmla   v24.4s, v21.4s, %35.s[1]    \n"
-          "fmla   v24.4s, v22.4s, %35.s[2]    \n"
-          "fmla   v24.4s, v23.4s, %35.s[3]    \n"
-
-          "st1    {v24.4s}, [%1], #16         \n"
-
-          "fmla   v25.4s, v16.4s, %36.s[0]    \n"
-          "fmla   v25.4s, v17.4s, %36.s[1]    \n"
-          "fmla   v25.4s, v18.4s, %36.s[2]    \n"
-          "fmla   v25.4s, v19.4s, %36.s[3]    \n"
-
-          "fmla   v25.4s, v20.4s, %37.s[0]    \n"
-          "fmla   v25.4s, v21.4s, %37.s[1]    \n"
-          "fmla   v25.4s, v22.4s, %37.s[2]    \n"
-          "fmla   v25.4s, v23.4s, %37.s[3]    \n"
-
-          "prfm   pldl1keep, [%4, #128]       \n"
-          "ld1    {v24.4s}, [%4]              \n"
-
-          "st1    {v25.4s}, [%2], #16         \n"
-
-          "fmla   v26.4s, v16.4s, %38.s[0]    \n"
-          "fmla   v26.4s, v17.4s, %38.s[1]    \n"
-          "fmla   v26.4s, v18.4s, %38.s[2]    \n"
-          "fmla   v26.4s, v19.4s, %38.s[3]    \n"
+        "prfm   pldl1keep, [%9, #128]       \n"
+        "ld1    {v16.4s}, [%9], #16         \n"
 
-          "fmla   v26.4s, v20.4s, %39.s[0]    \n"
-          "fmla   v26.4s, v21.4s, %39.s[1]    \n"
-          "fmla   v26.4s, v22.4s, %39.s[2]    \n"
-          "fmla   v26.4s, v23.4s, %39.s[3]    \n"
+        "prfm   pldl1keep, [%1, #128]       \n"
+        "ld1    {v18.4s}, [%1]              \n"
 
-          "prfm   pldl1keep, [%5, #128]       \n"
-          "ld1    {v25.4s}, [%5]              \n"
+        "prfm   pldl1keep, [%2, #128]       \n"
+        "ld1    {v19.4s}, [%2]              \n"
 
-          "st1    {v26.4s}, [%3], #16         \n"
-
-          "fmla   v24.4s, v16.4s, %40.s[0]    \n"
-          "fmla   v24.4s, v17.4s, %40.s[1]    \n"
-          "fmla   v24.4s, v18.4s, %40.s[2]    \n"
-          "fmla   v24.4s, v19.4s, %40.s[3]    \n"
-
-          "fmla   v24.4s, v20.4s, %41.s[0]    \n"
-          "fmla   v24.4s, v21.4s, %41.s[1]    \n"
-          "fmla   v24.4s, v22.4s, %41.s[2]    \n"
-          "fmla   v24.4s, v23.4s, %41.s[3]    \n"
-
-          "prfm   pldl1keep, [%6, #128]       \n"
-          "ld1    {v26.4s}, [%6]              \n"
-
-          "st1    {v24.4s}, [%4], #16         \n"
-
-          "fmla   v25.4s, v16.4s, %42.s[0]    \n"
-          "fmla   v25.4s, v17.4s, %42.s[1]    \n"
-          "fmla   v25.4s, v18.4s, %42.s[2]    \n"
-          "fmla   v25.4s, v19.4s, %42.s[3]    \n"
-
-          "fmla   v25.4s, v20.4s, %43.s[0]    \n"
-          "fmla   v25.4s, v21.4s, %43.s[1]    \n"
-          "fmla   v25.4s, v22.4s, %43.s[2]    \n"
-          "fmla   v25.4s, v23.4s, %43.s[3]    \n"
-
-          "prfm   pldl1keep, [%7, #128]       \n"
-          "ld1    {v24.4s}, [%7]              \n"
-
-          "st1    {v25.4s}, [%5], #16         \n"
-
-          "fmla   v26.4s, v16.4s, %44.s[0]    \n"
-          "fmla   v26.4s, v17.4s, %44.s[1]    \n"
-          "fmla   v26.4s, v18.4s, %44.s[2]    \n"
-          "fmla   v26.4s, v19.4s, %44.s[3]    \n"
-
-          "fmla   v26.4s, v20.4s, %45.s[0]    \n"
-          "fmla   v26.4s, v21.4s, %45.s[1]    \n"
-          "fmla   v26.4s, v22.4s, %45.s[2]    \n"
-          "fmla   v26.4s, v23.4s, %45.s[3]    \n"
-
-          "prfm   pldl1keep, [%8, #128]       \n"
-          "ld1    {v25.4s}, [%8]              \n"
-
-          "st1    {v26.4s}, [%6], #16         \n"
-
-          "fmla   v24.4s, v16.4s, %46.s[0]    \n"
-          "fmla   v24.4s, v17.4s, %46.s[1]    \n"
-          "fmla   v24.4s, v18.4s, %46.s[2]    \n"
-          "fmla   v24.4s, v19.4s, %46.s[3]    \n"
-
-          "fmla   v24.4s, v20.4s, %47.s[0]    \n"
-          "fmla   v24.4s, v21.4s, %47.s[1]    \n"
-          "fmla   v24.4s, v22.4s, %47.s[2]    \n"
-          "fmla   v24.4s, v23.4s, %47.s[3]    \n"
-
-          "st1    {v24.4s}, [%7], #16         \n"
-
-          "fmla   v25.4s, v16.4s, %48.s[0]    \n"
-          "fmla   v25.4s, v17.4s, %48.s[1]    \n"
-          "fmla   v25.4s, v18.4s, %48.s[2]    \n"
-          "fmla   v25.4s, v19.4s, %48.s[3]    \n"
-
-          "fmla   v25.4s, v20.4s, %49.s[0]    \n"
-          "fmla   v25.4s, v21.4s, %49.s[1]    \n"
-          "fmla   v25.4s, v22.4s, %49.s[2]    \n"
-          "fmla   v25.4s, v23.4s, %49.s[3]    \n"
-
-          "st1    {v25.4s}, [%8], #16         \n"
+        "0:                                 \n"
 
-          "subs   %w0, %w0, #1                \n"
-          "bne    0b                          \n"
-        : "=r"(nw),      // 0
+        "prfm   pldl1keep, [%3, #128]       \n"
+        "ld1    {v20.4s}, [%3]              \n"
+        "prfm   pldl1keep, [%4, #128]       \n"
+        "ld1    {v21.4s}, [%4]              \n"
+        "prfm   pldl1keep, [%5, #128]       \n"
+        "ld1    {v22.4s}, [%5]              \n"
+        "prfm   pldl1keep, [%6, #128]       \n"
+        "ld1    {v23.4s}, [%6]              \n"
+        "prfm   pldl1keep, [%7, #128]       \n"
+        "ld1    {v24.4s}, [%7]              \n"
+        "prfm   pldl1keep, [%8, #128]       \n"
+        "ld1    {v25.4s}, [%8]              \n"
+        "prfm   pldl1keep, [%10, #128]      \n"
+        "ld1    {v17.4s}, [%10], #16        \n"
+
+        "fmla   v18.4s, v16.4s, %34.s[0]    \n"
+        "fmla   v19.4s, v16.4s, %35.s[0]    \n"
+        "fmla   v20.4s, v16.4s, %36.s[0]    \n"
+        "fmla   v21.4s, v16.4s, %37.s[0]    \n"
+
+        "fmla   v22.4s, v16.4s, %38.s[0]    \n"
+        "fmla   v23.4s, v16.4s, %39.s[0]    \n"
+        "fmla   v24.4s, v16.4s, %40.s[0]    \n"
+        "fmla   v25.4s, v16.4s, %41.s[0]    \n"
+
+        "fmla   v18.4s, v17.4s, %34.s[1]    \n"
+        "fmla   v19.4s, v17.4s, %35.s[1]    \n"
+        "fmla   v20.4s, v17.4s, %36.s[1]    \n"
+        "fmla   v21.4s, v17.4s, %37.s[1]    \n"
+
+        "prfm   pldl1keep, [%11, #128]      \n"
+        "ld1    {v16.4s}, [%11], #16        \n"
+
+        "fmla   v22.4s, v17.4s, %38.s[1]    \n"
+        "fmla   v23.4s, v17.4s, %39.s[1]    \n"
+        "fmla   v24.4s, v17.4s, %40.s[1]    \n"
+        "fmla   v25.4s, v17.4s, %41.s[1]    \n"
+
+        "fmla   v18.4s, v16.4s, %34.s[2]    \n"
+        "fmla   v19.4s, v16.4s, %35.s[2]    \n"
+        "fmla   v20.4s, v16.4s, %36.s[2]    \n"
+        "fmla   v21.4s, v16.4s, %37.s[2]    \n"
+
+        "prfm   pldl1keep, [%12, #128]      \n"
+        "ld1    {v17.4s}, [%12], #16        \n"
+
+        "fmla   v22.4s, v16.4s, %38.s[2]    \n"
+        "fmla   v23.4s, v16.4s, %39.s[2]    \n"
+        "fmla   v24.4s, v16.4s, %40.s[2]    \n"
+        "fmla   v25.4s, v16.4s, %41.s[2]    \n"
+
+        "fmla   v18.4s, v17.4s, %34.s[3]    \n"
+        "fmla   v19.4s, v17.4s, %35.s[3]    \n"
+        "fmla   v20.4s, v17.4s, %36.s[3]    \n"
+        "fmla   v21.4s, v17.4s, %37.s[3]    \n"
+
+        "prfm   pldl1keep, [%13, #128]      \n"
+        "ld1    {v16.4s}, [%13], #16        \n"
+
+        "fmla   v22.4s, v17.4s, %38.s[3]    \n"
+        "fmla   v23.4s, v17.4s, %39.s[3]    \n"
+        "fmla   v24.4s, v17.4s, %40.s[3]    \n"
+        "fmla   v25.4s, v17.4s, %41.s[3]    \n"
+
+        "fmla   v18.4s, v16.4s, %42.s[0]    \n"
+        "fmla   v19.4s, v16.4s, %43.s[0]    \n"
+        "fmla   v20.4s, v16.4s, %44.s[0]    \n"
+        "fmla   v21.4s, v16.4s, %45.s[0]    \n"
+
+        "prfm   pldl1keep, [%14, #128]      \n"
+        "ld1    {v17.4s}, [%14], #16        \n"
+
+        "fmla   v22.4s, v16.4s, %46.s[0]    \n"
+        "fmla   v23.4s, v16.4s, %47.s[0]    \n"
+        "fmla   v24.4s, v16.4s, %48.s[0]    \n"
+        "fmla   v25.4s, v16.4s, %49.s[0]    \n"
+
+        "fmla   v18.4s, v17.4s, %42.s[1]    \n"
+        "fmla   v19.4s, v17.4s, %43.s[1]    \n"
+        "fmla   v20.4s, v17.4s, %44.s[1]    \n"
+        "fmla   v21.4s, v17.4s, %45.s[1]    \n"
+
+        "prfm   pldl1keep, [%15, #128]      \n"
+        "ld1    {v16.4s}, [%15], #16        \n"
+
+        "fmla   v22.4s, v17.4s, %46.s[1]    \n"
+        "fmla   v23.4s, v17.4s, %47.s[1]    \n"
+        "fmla   v24.4s, v17.4s, %48.s[1]    \n"
+        "fmla   v25.4s, v17.4s, %49.s[1]    \n"
+
+        "fmla   v18.4s, v16.4s, %42.s[2]    \n"
+        "fmla   v19.4s, v16.4s, %43.s[2]    \n"
+        "fmla   v20.4s, v16.4s, %44.s[2]    \n"
+        "fmla   v21.4s, v16.4s, %45.s[2]    \n"
+
+        "prfm   pldl1keep, [%16, #128]      \n"
+        "ld1    {v17.4s}, [%16], #16        \n"
+
+        "fmla   v22.4s, v16.4s, %46.s[2]    \n"
+        "fmla   v23.4s, v16.4s, %47.s[2]    \n"
+        "fmla   v24.4s, v16.4s, %48.s[2]    \n"
+        "fmla   v25.4s, v16.4s, %49.s[2]    \n"
+
+        "fmla   v18.4s, v17.4s, %42.s[3]    \n"
+        "fmla   v19.4s, v17.4s, %43.s[3]    \n"
+        "fmla   v20.4s, v17.4s, %44.s[3]    \n"
+        "fmla   v21.4s, v17.4s, %45.s[3]    \n"
+
+        "st1    {v18.4s}, [%1], #16         \n"
+        "st1    {v19.4s}, [%2], #16         \n"
+        "st1    {v20.4s}, [%3], #16         \n"
+        "st1    {v21.4s}, [%4], #16         \n"
+
+        "fmla   v22.4s, v17.4s, %46.s[3]    \n"
+        "fmla   v23.4s, v17.4s, %47.s[3]    \n"
+        "fmla   v24.4s, v17.4s, %48.s[3]    \n"
+        "fmla   v25.4s, v17.4s, %49.s[3]    \n"
+
+        "st1    {v22.4s}, [%5], #16         \n"
+        "st1    {v23.4s}, [%6], #16         \n"
+        "st1    {v24.4s}, [%7], #16         \n"
+        "st1    {v25.4s}, [%8], #16         \n"
+
+        "prfm   pldl1keep, [%9, #128]       \n"
+        "ld1    {v16.4s}, [%9], #16         \n"
+        "prfm   pldl1keep, [%1, #128]       \n"
+        "ld1    {v18.4s}, [%1]              \n"
+        "prfm   pldl1keep, [%2, #128]       \n"
+        "ld1    {v19.4s}, [%2]              \n"
+
+        "subs   %w0, %w0, #1                \n"
+        "bne    0b                          \n"
+        : "=r"(nw),    // 0
         "=r"(c_ptr0),  // 1
         "=r"(c_ptr1),  // 2
         "=r"(c_ptr2),  // 3
@@ -351,7 +345,7 @@ inline void GemmTile(const float *A,
         "=r"(b_ptr5),  // 14
         "=r"(b_ptr6),  // 15
         "=r"(b_ptr7)   // 16
-        : "0"(nw),       // 17
+        : "0"(nw),     // 17
         "1"(c_ptr0),   // 18
         "2"(c_ptr1),   // 19
         "3"(c_ptr2),   // 20
@@ -369,20 +363,20 @@ inline void GemmTile(const float *A,
         "15"(b_ptr6),  // 32
         "16"(b_ptr7),  // 33
         "w"(a0),       // 34
-        "w"(a1),       // 35
-        "w"(a2),       // 36
-        "w"(a3),       // 37
-        "w"(a4),       // 38
-        "w"(a5),       // 39
-        "w"(a6),       // 40
-        "w"(a7),       // 41
-        "w"(a8),       // 42
-        "w"(a9),       // 43
-        "w"(a10),      // 44
-        "w"(a11),      // 45
-        "w"(a12),      // 46
-        "w"(a13),      // 47
-        "w"(a14),      // 48
+        "w"(a2),       // 35
+        "w"(a4),       // 36
+        "w"(a6),       // 37
+        "w"(a8),       // 38
+        "w"(a10),      // 39
+        "w"(a12),      // 40
+        "w"(a14),      // 41
+        "w"(a1),       // 42
+        "w"(a3),       // 43
+        "w"(a5),       // 44
+        "w"(a7),       // 45
+        "w"(a9),       // 46
+        "w"(a11),      // 47
+        "w"(a13),      // 48
         "w"(a15)       // 49
         : "cc", "memory",
         "v16",
@@ -585,7 +579,6 @@ void Gemm(const float *A,
   }
   memset(C, 0, sizeof(float) * batch * height * width);
 
-
   // It is better to use large block size if it fits for fast cache.
   // Assume l1 cache size is 32k, we load three blocks at a time (A, B, C),
   // the block size should be sqrt(32k / sizeof(T) / 3).
diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc
index 5cee48620aa0aa6be6600bbbe331016a879c4c54..6b55696614201b24d4492275c2ae219a5038926e 100644
--- a/mace/kernels/opencl/activation.cc
+++ b/mace/kernels/opencl/activation.cc
@@ -45,7 +45,8 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 4587a2cb7b2c8a5cebe0470533a1457bb6937e1a..c47213f593cce3da126555993b5b500e95019414 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -58,7 +58,8 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc
index f28c9ccc6cda25ec713c108bc1eae2ad3f9a38ed..80fafdbcb3f0f129f6ce97ab6cf57406cb617b60 100644
--- a/mace/kernels/opencl/batch_norm.cc
+++ b/mace/kernels/opencl/batch_norm.cc
@@ -56,7 +56,8 @@ void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc
index b6d2b4b1855d2210cb25fb6b99921800d18a6cba..e50dcf58611360833ae2fbaeff2dc9d2d721c01b 100644
--- a/mace/kernels/opencl/bias_add.cc
+++ b/mace/kernels/opencl/bias_add.cc
@@ -49,7 +49,8 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
@@ -90,7 +91,8 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   } else {
     std::vector<uint32_t> roundup_gws(lws.size());
     for (size_t i = 0; i < lws.size(); ++i) {
-      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+      if (lws[i] != 0)
+        roundup_gws[i] = RoundUp(gws[i], lws[i]);
     }
 
     error = runtime->command_queue().enqueueNDRangeKernel(
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index 1bce914c1b817a489f444e32b5284c25a7f0d527..bf629e373b77ce57f24640c342a6ce6fe2c5ab45 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -93,7 +93,8 @@ void BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     built_options.emplace("-DOUT_OF_RANGE_CHECK");
     if (!kernel_error_) {
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index 7cb082544f55c2bf72711ec0fe6ec0e8448442eb..d16a3d8a73c14df3282c940f01db5b2848a78d34 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -56,7 +56,8 @@ void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 96c15fd8adfe4369d483cc6be424d341fe59b743..239041008bcf08cd898bbe9b7f722d68cb4afdec 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -67,7 +67,8 @@ static void Concat2(cl::Kernel *kernel,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      (*kernel_error)->Allocate(1);
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
@@ -148,7 +149,8 @@ static void ConcatN(cl::Kernel *kernel,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      (*kernel_error)->Allocate(1);
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc
index 52ed0368fc6fbe824ee2f254394ecfbc6324071b..5b79ea662a3f0d101ca4f4e3ab0faf2ce5f2ffd3 100644
--- a/mace/kernels/opencl/conv_2d_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_1x1.cc
@@ -100,7 +100,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      (*kernel_error)->Allocate(1);
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc
index f5600883d850e7f05052e8830b2ab79815ffd15e..5386c4173ad5441be104c7e10c183113223915ef 100644
--- a/mace/kernels/opencl/conv_2d_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_3x3.cc
@@ -86,7 +86,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      (*kernel_error)->Allocate(1);
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc
index 2329984a0c53592c8d73a4b629a63167081c2a33..e44d898168fdfd096fa495d6b076a668f4f1a2fd 100644
--- a/mace/kernels/opencl/conv_2d_general.cc
+++ b/mace/kernels/opencl/conv_2d_general.cc
@@ -94,7 +94,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      (*kernel_error)->Allocate(1);
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc
index bbcbec6c01126095af0e706986d5c47a1ffa707a..abb4b43effff898009bdc56436f9ae44b16cc40b 100644
--- a/mace/kernels/opencl/deconv_2d_opencl.cc
+++ b/mace/kernels/opencl/deconv_2d_opencl.cc
@@ -65,7 +65,8 @@ void Deconv2dOpencl(cl::Kernel *kernel,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
-          new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      (*kernel_error)->Allocate(1);
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc
index fd25f948c355999909bcd670e41ff249dc4e5aea..609ad20516444970013ecc5ba796eafb16c060f3 100644
--- a/mace/kernels/opencl/depth_to_space.cc
+++ b/mace/kernels/opencl/depth_to_space.cc
@@ -86,7 +86,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc
index 43a24e662828a5282914c230553220cc2adc30c9..c7800d0ae24e5d4ed7471cc1f1ea3829ddacd9fc 100644
--- a/mace/kernels/opencl/depthwise_conv.cc
+++ b/mace/kernels/opencl/depthwise_conv.cc
@@ -97,7 +97,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      (*kernel_error)->Allocate(1);
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc
index 94b4c322bc625fa82f9d2e482c99e5b95fdd41d3..4f0590466b02c1a400682a89b656394bdd7318b3 100644
--- a/mace/kernels/opencl/eltwise.cc
+++ b/mace/kernels/opencl/eltwise.cc
@@ -97,7 +97,8 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc
index 0022b92380208a7f4eb06dff68a8ec45c18dff39..6e0678daff35642a9c673ecd1a76e4059858aa67 100644
--- a/mace/kernels/opencl/fully_connected.cc
+++ b/mace/kernels/opencl/fully_connected.cc
@@ -74,7 +74,8 @@ void FCWXKernel(cl::Kernel *kernel,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      (*kernel_error)->Allocate(1);
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
@@ -200,7 +201,8 @@ void FCWTXKernel(cl::Kernel *kernel,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      (*kernel_error)->Allocate(1);
       (*kernel_error)->Map(nullptr);
       *((*kernel_error)->mutable_data<char>()) = 0;
       (*kernel_error)->UnMap();
diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc
index 09b040dd454fbe1a2d24a85019c66116d3610e70..1cefff9e6ba56f00ed366e16aae268dcd0a78e16 100644
--- a/mace/kernels/opencl/image_to_buffer.cc
+++ b/mace/kernels/opencl/image_to_buffer.cc
@@ -86,7 +86,8 @@ void ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
     built_options.emplace("-DOUT_OF_RANGE_CHECK");
     if (!kernel_error_) {
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 9a16694a0284f1b6583ee633487b4725283bafea..cc63ed04962938f1c4b75e57ec7a618f06fbf2aa 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -54,7 +54,8 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc
index 012edd70cd9a8c73a409886d37fad0b29ef8411b..467a595309c32f26de3ae271040045092252d840 100644
--- a/mace/kernels/opencl/out_of_range_check_test.cc
+++ b/mace/kernels/opencl/out_of_range_check_test.cc
@@ -57,7 +57,8 @@ bool BufferToImageOpImpl(Tensor *buffer,
   if (runtime->IsOutOfRangeCheckEnabled()) {
     built_options.emplace("-DOUT_OF_RANGE_CHECK");
     kernel_error = std::move(std::unique_ptr<Buffer>(
-          new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+        new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+    kernel_error->Allocate(1);
     kernel_error->Map(nullptr);
     *(kernel_error->mutable_data<char>()) = 0;
     kernel_error->UnMap();
diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc
index bc093c16e3f2b66017fe368436b5f172bb9b3d5f..34fbf659ebab12a5258f87234df7f131701f84cf 100644
--- a/mace/kernels/opencl/pad.cc
+++ b/mace/kernels/opencl/pad.cc
@@ -60,7 +60,8 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-          new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc
index df2fcbe9223f902c721de2ace2aa7d5b780498c1..8a9f91e90df8c79369793b35f59a49e39af7c6e1 100644
--- a/mace/kernels/opencl/pooling.cc
+++ b/mace/kernels/opencl/pooling.cc
@@ -72,7 +72,8 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc
index 1c36b27e37dbb5a5649203c2d8fa37a69f5f3266..0c86cae840ac15ce4a56be3727d8d747cfe9d179 100644
--- a/mace/kernels/opencl/resize_bilinear.cc
+++ b/mace/kernels/opencl/resize_bilinear.cc
@@ -78,7 +78,8 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index 7944ee88b9845f93438d363069e9afbc4065f873..21fdbca1ecb6b2a492787ffe93601af9466bce96 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -51,7 +51,8 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc
index 24329be41496ee740e34125a9cfaa08f793d5644..8e5be84509bd5ae4d49fdcfecbd0cdbd7e9f0359 100644
--- a/mace/kernels/opencl/softmax.cc
+++ b/mace/kernels/opencl/softmax.cc
@@ -70,7 +70,8 @@ void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc
index fa4850cb95348b460fefde519c77fcd97f818504..c3c45f0b2a18bce690cad718b2d61391d243a0f4 100644
--- a/mace/kernels/opencl/space_to_batch.cc
+++ b/mace/kernels/opencl/space_to_batch.cc
@@ -70,7 +70,8 @@ void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index fcf815281ea783fc184819b8d86ac3480bcadd66..da7dea0b974c09e7cf8e7e45442ff44a95eadfe4 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -39,7 +39,8 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
@@ -138,7 +139,8 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+      kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 97a65f1eae53b26d68c7c43e07833d4f01a9c33f..9c510b34d3d9b3d162e7e2ceae0dc0210aec264b 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -75,39 +75,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
   }
 
   void MaxPooling(const float *input,
-                  const index_t batch,
-                  const index_t in_height,
-                  const index_t in_width,
-                  const index_t channels,
-                  const index_t out_height,
-                  const index_t out_width,
-                  const int filter_height,
-                  const int filter_width,
-                  const int stride_h,
-                  const int stride_w,
-                  const int dilation_h,
-                  const int dilation_w,
-                  const int pad_top,
-                  const int pad_left,
+                  const index_t *in_shape,
+                  const index_t *out_shape,
+                  const int *filter_hw,
+                  const int *stride_hw,
+                  const int *dilation_hw,
+                  const int *pad_hw,
                   float *output) {
-    const index_t in_image_size = in_height * in_width;
-    const index_t out_image_size = out_height * out_width;
-    const index_t in_batch_size = channels * in_image_size;
-    const index_t out_batch_size = channels * out_image_size;
+    const index_t in_image_size = in_shape[2] * in_shape[3];
+    const index_t out_image_size = out_shape[2] * out_shape[3];
+    const index_t in_batch_size = in_shape[1] * in_image_size;
+    const index_t out_batch_size = out_shape[1] * out_image_size;
 
 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t c = 0; c < channels; ++c) {
+    for (index_t b = 0; b < out_shape[0]; ++b) {
+      for (index_t c = 0; c < out_shape[1]; ++c) {
         const index_t out_base = b * out_batch_size + c * out_image_size;
         const index_t in_base = b * in_batch_size + c * in_image_size;
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
+        const index_t in_height = in_shape[2];
+        const index_t in_width = in_shape[3];
+
         for (index_t h = 0; h < out_height; ++h) {
           for (index_t w = 0; w < out_width; ++w) {
             const index_t out_offset = out_base + h * out_width + w;
             float res = std::numeric_limits<float>::lowest();
-            for (int fh = 0; fh < filter_height; ++fh) {
-              for (int fw = 0; fw < filter_width; ++fw) {
-                int inh = h * stride_h + dilation_h * fh - pad_top;
-                int inw = w * stride_w + dilation_w * fw - pad_left;
+            for (int fh = 0; fh < filter_hw[0]; ++fh) {
+              for (int fw = 0; fw < filter_hw[1]; ++fw) {
+                index_t inh =
+                    h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0];
+                index_t inw =
+                    w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1];
                 if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
                   index_t input_offset = in_base + inh * in_width + inw;
                   res = std::max(res, input[input_offset]);
@@ -122,40 +121,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
   }
 
   void AvgPooling(const float *input,
-                  const index_t batch,
-                  const index_t in_height,
-                  const index_t in_width,
-                  const index_t channels,
-                  const index_t out_height,
-                  const index_t out_width,
-                  const int filter_height,
-                  const int filter_width,
-                  const int stride_h,
-                  const int stride_w,
-                  const int dilation_h,
-                  const int dilation_w,
-                  const int pad_top,
-                  const int pad_left,
+                  const index_t *in_shape,
+                  const index_t *out_shape,
+                  const int *filter_hw,
+                  const int *stride_hw,
+                  const int *dilation_hw,
+                  const int *pad_hw,
                   float *output) {
-    const index_t in_image_size = in_height * in_width;
-    const index_t out_image_size = out_height * out_width;
-    const index_t in_batch_size = channels * in_image_size;
-    const index_t out_batch_size = channels * out_image_size;
+    const index_t in_image_size = in_shape[2] * in_shape[3];
+    const index_t out_image_size = out_shape[2] * out_shape[3];
+    const index_t in_batch_size = in_shape[1] * in_image_size;
+    const index_t out_batch_size = out_shape[1] * out_image_size;
 
 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t c = 0; c < channels; ++c) {
+    for (index_t b = 0; b < out_shape[0]; ++b) {
+      for (index_t c = 0; c < out_shape[1]; ++c) {
         const index_t out_base = b * out_batch_size + c * out_image_size;
         const index_t in_base = b * in_batch_size + c * in_image_size;
+        const index_t in_height = in_shape[2];
+        const index_t in_width = in_shape[3];
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
         for (index_t h = 0; h < out_height; ++h) {
           for (index_t w = 0; w < out_width; ++w) {
             const index_t out_offset = out_base + h * out_width + w;
             float res = 0;
             int block_size = 0;
-            for (int fh = 0; fh < filter_height; ++fh) {
-              for (int fw = 0; fw < filter_width; ++fw) {
-                int inh = h * stride_h + dilation_h * fh - pad_top;
-                int inw = w * stride_w + dilation_w * fw - pad_left;
+            for (int fh = 0; fh < filter_hw[0]; ++fh) {
+              for (int fw = 0; fw < filter_hw[1]; ++fw) {
+                index_t inh =
+                    h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0];
+                index_t inw =
+                    w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1];
                 if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
                   index_t input_offset = in_base + inh * in_width + inw;
                   res += input[input_offset];
@@ -200,59 +197,25 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
     const float *input = input_tensor->data<float>();
     float *output = output_tensor->mutable_data<float>();
     const index_t *input_shape = input_tensor->shape().data();
-    index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t height = output_shape[2];
-    index_t width = output_shape[3];
-
-    index_t input_height = input_shape[2];
-    index_t input_width = input_shape[3];
-
-    int filter_h = kernels_[0];
-    int filter_w = kernels_[1];
-
-    int stride_h = strides_[0];
-    int stride_w = strides_[1];
-
-    int dilation_h = dilations_[0];
-    int dilation_w = dilations_[1];
-
-    int pad_top = paddings[0] / 2;
-    int pad_left = paddings[1] / 2;
+    int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2};
 
     if (pooling_type_ == PoolingType::MAX) {
       MaxPooling(input,
-                 batch,
-                 input_height,
-                 input_width,
-                 channels,
-                 height,
-                 width,
-                 filter_h,
-                 filter_w,
-                 stride_h,
-                 stride_w,
-                 dilation_h,
-                 dilation_w,
-                 pad_top,
-                 pad_left,
+                 input_shape,
+                 output_shape.data(),
+                 kernels_,
+                 strides_,
+                 dilations_,
+                 pad_hw,
                  output);
     } else if (pooling_type_ == PoolingType::AVG) {
       AvgPooling(input,
-                 batch,
-                 input_height,
-                 input_width,
-                 channels,
-                 height,
-                 width,
-                 filter_h,
-                 filter_w,
-                 stride_h,
-                 stride_w,
-                 dilation_h,
-                 dilation_w,
-                 pad_top,
-                 pad_left,
+                 input_shape,
+                 output_shape.data(),
+                 kernels_,
+                 strides_,
+                 dilations_,
+                 pad_hw,
                  output);
     } else {
       MACE_NOT_IMPLEMENTED;
diff --git a/mace/ops/activation.h b/mace/ops/activation.h
index 5f07425a649992907f6c9d5103fbf156119cdf7f..7c6d3b5690df5928a835c6bb98c7f2db542da20d 100644
--- a/mace/ops/activation.h
+++ b/mace/ops/activation.h
@@ -38,7 +38,7 @@ class ActivationOp : public Operator<D, T> {
     const Tensor *input_tensor = this->Input(0);
     const Tensor *alpha_tensor =
         this->InputSize() >= 2 ? this->Input(1) : nullptr;
-    Tensor *output_tensor = this->outputs_[0];
+    Tensor *output_tensor = this->Output(0);
     output_tensor->ResizeLike(input_tensor);
 
     functor_(input_tensor, alpha_tensor, output_tensor, future);
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 319fc108a6699c8e704adec9dc694307a0047bfe..733673a7273dae6cc19f308e8561ef5c5fd16201 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -620,6 +620,8 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
   static void Near(const Tensor &x, const Tensor &y,
                    const double rel_err,
                    const double abs_err) {
+    MACE_UNUSED(rel_err);
+    MACE_UNUSED(abs_err);
     Equal(x, y);
   }
 };
diff --git a/mace/public/mace.h b/mace/public/mace.h
index cfeeb0cadfb7314d774e9ddfd37e46c6d5c863da..46cf4ac32df4b311aac8741c5356b1a5308f673f 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -56,7 +56,11 @@ class RunMetadata {
 
 const char *MaceVersion();
 
-enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 };
+enum MaceStatus {
+  MACE_SUCCESS = 0,
+  MACE_INVALID_ARGS = 1,
+  MACE_OUT_OF_RESOURCES = 2
+};
 
 // MACE input/output tensor
 class MaceTensor {
@@ -84,13 +88,14 @@ class MaceTensor {
 
 class MaceEngine {
  public:
-  explicit MaceEngine(const NetDef *net_def,
-                      DeviceType device_type,
-                      const std::vector<std::string> &input_nodes,
-                      const std::vector<std::string> &output_nodes,
-                      const unsigned char *model_data);
+  explicit MaceEngine(DeviceType device_type);
   ~MaceEngine();
 
+  MaceStatus Init(const NetDef *net_def,
+                  const std::vector<std::string> &input_nodes,
+                  const std::vector<std::string> &output_nodes,
+                  const unsigned char *model_data);
+
   MaceStatus Run(const std::map<std::string, MaceTensor> &inputs,
                  std::map<std::string, MaceTensor> *outputs);
 
diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py
index 22e8c68034808115f71c915a9d36f2eedefe5e6a..4f65e971e0dd3148b6376c47ccf3e93046aa741e 100644
--- a/mace/python/tools/converter.py
+++ b/mace/python/tools/converter.py
@@ -128,7 +128,7 @@ def main(unused_args):
                                                        FLAGS.weight_file)
 
         output_graph_def = converter.run()
-        print("Transform model to one that can better run on device.")
+        print("Transform model to one that can better run on device")
         if not FLAGS.runtime:
             cpu_graph_def = copy.deepcopy(output_graph_def)
             option.device = mace_pb2.CPU
diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py
index 277727d4041c97141d552740275488650ca4a3a2..8819128a946289907f73af77fb9c3c1cb67c7b61 100644
--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -136,23 +136,25 @@ class MaceKeyword(object):
 
 
 class TransformerRule(Enum):
-    REMOVE_IDENTITY_OP = 0
-    TRANSFORM_GLOBAL_POOLING = 1
-    FOLD_SOFTMAX = 2
-    FOLD_BATCHNORM = 3,
-    FOLD_CONV_AND_BN = 4,
-    FOLD_DEPTHWISE_CONV_AND_BN = 5,
-    TRANSFORM_GPU_WINOGRAD = 6,
-    TRANSFORM_ADD_TO_BIASADD = 7,
-    FOLD_BIASADD = 8,
-    FOLD_ACTIVATION = 9,
-    TRANSPOSE_FILTERS = 10,
-    RESHAPE_FC_WEIGHT = 11,
-    TRANSPOSE_DATA_FORMAT = 12,
-    TRANSFORM_GLOBAL_CONV_TO_FC = 13,
-    TRANSFORM_BUFFER_IMAGE = 14,
-    ADD_DEVICE_AND_DATA_TYPE = 15,
-    SORT_BY_EXECUTION = 16
+    REMOVE_USELESS_RESHAPE_OP = 0
+    REMOVE_IDENTITY_OP = 1
+    TRANSFORM_GLOBAL_POOLING = 2
+    FOLD_RESHAPE = 3
+    TRANSFORM_MATMUL_TO_FC = 4
+    FOLD_BATCHNORM = 5
+    FOLD_CONV_AND_BN = 6
+    FOLD_DEPTHWISE_CONV_AND_BN = 7
+    TRANSFORM_GPU_WINOGRAD = 8
+    TRANSFORM_ADD_TO_BIASADD = 9
+    FOLD_BIASADD = 10
+    FOLD_ACTIVATION = 11
+    TRANSPOSE_FILTERS = 12
+    RESHAPE_FC_WEIGHT = 13
+    TRANSPOSE_DATA_FORMAT = 14
+    TRANSFORM_GLOBAL_CONV_TO_FC = 15
+    TRANSFORM_BUFFER_IMAGE = 16
+    ADD_DEVICE_AND_DATA_TYPE = 17
+    SORT_BY_EXECUTION = 18
 
 
 class ConverterInterface(object):
@@ -199,9 +201,11 @@ class ConverterOption(object):
         self._device = mace_pb2.CPU
         self._winograd_enabled = False
         self._transformer_option = [
+            TransformerRule.REMOVE_USELESS_RESHAPE_OP,
             TransformerRule.REMOVE_IDENTITY_OP,
             TransformerRule.TRANSFORM_GLOBAL_POOLING,
-            TransformerRule.FOLD_SOFTMAX,
+            TransformerRule.FOLD_RESHAPE,
+            TransformerRule.TRANSFORM_MATMUL_TO_FC,
             TransformerRule.FOLD_BATCHNORM,
             TransformerRule.FOLD_CONV_AND_BN,
             TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN,
diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py
index 6f2247e6075982a28ae038493b20408e6e965f0b..2917562df353872b4a4b5b9f1a9ad398a1e992e3 100644
--- a/mace/python/tools/converter_tool/tensorflow_converter.py
+++ b/mace/python/tools/converter_tool/tensorflow_converter.py
@@ -101,9 +101,11 @@ class TensorflowConverter(base_converter.ConverterInterface):
             'AvgPool': self.convert_pooling,
             'MaxPool': self.convert_pooling,
             'Squeeze': self.convert_identity,
+            'MatMul': self.convert_matmul,
             'Identity': self.convert_identity,
             'Reshape': self.convert_reshape,
             'Shape': self.convert_nop,
+            'Transpose': self.convert_transpose,
             'Softmax': self.convert_softmax,
             'ResizeBilinear': self.convert_resize_bilinear,
             'Placeholder': self.convert_nop,
@@ -144,7 +146,8 @@ class TensorflowConverter(base_converter.ConverterInterface):
             for i in xrange(len(op.input)):
                 if op.input[i][-2:] == ':0':
                     op_name = op.input[i][:-2]
-                    if op_name in self._option.input_nodes:
+                    if op_name in self._option.input_nodes \
+                            or op_name in self._option.output_nodes:
                         op.input[i] = op_name
             for i in xrange(len(op.output)):
                 if op.output[i][-2:] == ':0':
@@ -411,6 +414,10 @@ class TensorflowConverter(base_converter.ConverterInterface):
 
         self._skip_tensor.update(tf_op.inputs[-1].name)
 
+    def convert_matmul(self, tf_op):
+        op = self.convert_general_op(tf_op)
+        op.type = MaceOp.MatMul.name
+
     def convert_reshape(self, tf_op):
         op = self.convert_general_op(tf_op)
         op.type = MaceOp.Reshape.name
@@ -430,6 +437,20 @@ class TensorflowConverter(base_converter.ConverterInterface):
 
         shape_arg.ints.extend(shape_value)
 
+    def convert_transpose(self, tf_op):
+        perm = tf_op.inputs[1].eval().astype(np.int32)
+        ordered_perm = np.sort(perm)
+
+        mace_check(np.array_equal(perm, ordered_perm),
+                   "Transpose not supported yet, only internal transpose"
+                   " in composed ops might be supported")
+
+        op = self.convert_general_op(tf_op)
+        op.type = 'Identity'
+        del op.input[1:]
+
+        self._skip_tensor.add(tf_op.inputs[1].name)
+
     def convert_mean(self, tf_op):
         op = self.convert_general_op(tf_op)
         del op.input[1:]
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index a1776b2232583a660b304d5d8245d8e1967bc82c..825dd64f2171465247260d3d357d089d4b323fdf 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -53,9 +53,11 @@ class Transformer(base_converter.ConverterInterface):
     def __init__(self, option, model):
         # DO NOT reorder the following transformers
         self._registered_transformers_order = [
+            TransformerRule.REMOVE_USELESS_RESHAPE_OP,
             TransformerRule.REMOVE_IDENTITY_OP,
             TransformerRule.TRANSFORM_GLOBAL_POOLING,
-            TransformerRule.FOLD_SOFTMAX,
+            TransformerRule.FOLD_RESHAPE,
+            TransformerRule.TRANSFORM_MATMUL_TO_FC,
             TransformerRule.FOLD_BATCHNORM,
             TransformerRule.FOLD_CONV_AND_BN,
             TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN,
@@ -72,10 +74,14 @@ class Transformer(base_converter.ConverterInterface):
             TransformerRule.SORT_BY_EXECUTION,
         ]
         self._registered_transformers = {
+            TransformerRule.REMOVE_USELESS_RESHAPE_OP:
+                self.remove_useless_reshape_op,
             TransformerRule.REMOVE_IDENTITY_OP: self.remove_identity_op,
             TransformerRule.TRANSFORM_GLOBAL_POOLING:
                 self.transform_global_pooling,
-            TransformerRule.FOLD_SOFTMAX: self.fold_softmax,
+            TransformerRule.FOLD_RESHAPE: self.fold_reshape,
+            TransformerRule.TRANSFORM_MATMUL_TO_FC:
+                self.transform_matmul_to_fc,
             TransformerRule.FOLD_BATCHNORM: self.fold_batchnorm,
             TransformerRule.FOLD_CONV_AND_BN:
                 self.fold_conv_and_bn,  # data_format related
@@ -161,18 +167,26 @@ class Transformer(base_converter.ConverterInterface):
             for output_tensor in op.output:
                 self._producer[output_tensor] = op
         for input_node in self._option.input_nodes.values():
-            op = mace_pb2.OperatorDef()
-            op.name = self.normalize_op_name(input_node.name)
-            op.type = 'Input'
-            op.output.extend(input_node.name)
-            output_shape = op.output_shape.add()
-            output_shape.dims.extend(input_node.shape)
-            if self._option.device == mace_pb2.CPU:
-                self.transpose_shape(output_shape.dims, [0, 3, 1, 2])
-                ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
-            else:
-                ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
-            self._producer[op.output[0]] = op
+            input_node_existed = False
+            for op in self._model.op:
+                if input_node.name in op.output:
+                    input_node_existed = True
+                    break
+            if not input_node_existed:
+                op = mace_pb2.OperatorDef()
+                op.name = self.normalize_op_name(input_node.name)
+                op.type = 'Input'
+                op.output.extend([input_node.name])
+                output_shape = op.output_shape.add()
+                output_shape.dims.extend(input_node.shape)
+                if ConverterUtil.data_format(
+                        self._consumers[input_node.name][0]) \
+                        == DataFormat.NCHW:
+                    self.transpose_shape(output_shape.dims, [0, 3, 1, 2])
+                    ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
+                else:
+                    ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
+                self._producer[op.output[0]] = op
 
     @staticmethod
     def replace(obj_list, source, target):
@@ -191,6 +205,12 @@ class Transformer(base_converter.ConverterInterface):
     def normalize_op_name(name):
         return name.replace(':', '_')
 
+    def get_tensor_shape(self, tensor):
+        producer = self._producer[tensor]
+        for i in xrange(len(producer.output)):
+            if producer.output[i] == tensor:
+                return list(producer.output_shape[i].dims)
+
     def consumer_count(self, tensor_name):
         return len(self._consumers.get(tensor_name, []))
 
@@ -203,23 +223,68 @@ class Transformer(base_converter.ConverterInterface):
 
         return False
 
-    def replace_output_node(self, op):
-        """if it is an output node, change output node to the op before it"""
-        if self.is_op_output_node(op):
-            real_output_node = self._producer[op.input[0]]
-            self.replace(real_output_node.output, op.input[0], op.output[0])
-            print("change %s to %s" % (real_output_node.name, op.name))
+    def safe_remove_node(self, op, replace_op):
+        """remove op.
+        1. change the inputs of its consumers to the outputs of replace_op
+        2. if the op is output node, change output node to replace op"""
+
+        if replace_op is None:
+            # When no replace op specified, we change the inputs of
+            # its consumers to the input of the op. This handles the case
+            # that the op is identity op and its input is a tensor.
+            mace_check(len(op.output) == 1 and len(op.input) == 1,
+                       "cannot remove op that w/o replace op specified"
+                       " and input/output length > 1" + str(op))
+
+            for consumer_op in self._consumers.get(op.output[0], []):
+                self.replace(consumer_op.input, op.output[0], op.input[0])
+
+            mace_check(op.output[0] not in self._option.output_nodes,
+                       "cannot remove op that is output node")
+        else:
+            mace_check(len(op.output) == len(replace_op.output),
+                       "cannot remove op since len(op.output) "
+                       "!= len(replace_op.output)")
+
+            for i in xrange(len(op.output)):
+                for consumer_op in self._consumers.get(op.output[i], []):
+                    self.replace(consumer_op.input,
+                                 op.output[i],
+                                 replace_op.output[i])
+
+            # if the op is output node, change replace_op output name to the op
+            # output name
+            for i in xrange(len(op.output)):
+                if op.output[i] in self._option.output_nodes:
+                    for consumer in self._consumers.get(
+                            replace_op.output[i], []):
+                        self.replace(consumer.input,
+                                     replace_op.output[i],
+                                     op.output[i])
+                    replace_op.output[i] = op.output[i]
+
+        self._model.op.remove(op)
+
+    def remove_useless_reshape_op(self):
+        net = self._model
+        for op in net.op:
+            if op.type == MaceOp.Reshape.name:
+                shape = list(ConverterUtil.get_arg(
+                    op, MaceKeyword.mace_shape_str).ints)
+                if shape == self.get_tensor_shape(op.input[0]):
+                    print("Remove useless reshape: %s(%s)"
+                          % (op.name, op.type))
+                    op.type = 'Identity'
+
+        return False
 
     def remove_identity_op(self):
         net = self._model
         for op in net.op:
             if op.type == 'Identity':
                 print("Remove identity: %s(%s)" % (op.name, op.type))
-                for consumer_op in self._consumers.get(op.output[0], []):
-                    Transformer.replace(consumer_op.input, op.output[0],
-                                        op.input[0])
-                self.replace_output_node(op)
-                net.op.remove(op)
+                self.safe_remove_node(op,
+                                      self._producer.get(op.input[0], None))
                 return True
 
         return False
@@ -264,10 +329,10 @@ class Transformer(base_converter.ConverterInterface):
                         and len(self._consts[consumer_op.input[1]].dims) == 1:
                     print("Fold batchnorm: %s(%s)" % (op.name, op.type))
                     consumer_op.type = MaceOp.FoldedBatchNorm.name
-                    inputs = [op.input[0], op.input[1], consumer_op.input[1]]
-                    consumer_op.input[:] = inputs[:]
+                    consumer_op.input[:] = [op.input[0], op.input[1],
+                                            consumer_op.input[1]]
 
-                    net.op.remove(op)
+                    self.safe_remove_node(op, None)
                     return True
 
         return False
@@ -514,7 +579,7 @@ class Transformer(base_converter.ConverterInterface):
                     filter.float_data[:] = weight_tensor_value.flat[:]
                     filter.dims[:] = weight_tensor_value.shape[:]
 
-                    net.op.remove(op)
+                    self.safe_remove_node(op, iwt_op)
 
         return False
 
@@ -544,10 +609,8 @@ class Transformer(base_converter.ConverterInterface):
                 consumer_op = self._consumers[op.output[0]][0]
                 if consumer_op.type == MaceOp.BiasAdd.name:
                     print("Fold biasadd: %s(%s)" % (op.name, op.type))
-                    op.name = consumer_op.name
                     op.input.append(consumer_op.input[1])
-                    op.output[0] = consumer_op.output[0]
-                    net.op.remove(consumer_op)
+                    self.safe_remove_node(consumer_op, op)
                     return True
 
         return False
@@ -575,7 +638,7 @@ class Transformer(base_converter.ConverterInterface):
                                 or arg.name == MaceKeyword.mace_activation_max_limit_str:  # noqa
                             op.arg.extend([arg])
 
-                    net.op.remove(consumer_op)
+                    self.safe_remove_node(consumer_op, op)
                     return True
 
         return False
@@ -651,11 +714,14 @@ class Transformer(base_converter.ConverterInterface):
                 op.output.extend([input_node.name])
                 output_shape = op.output_shape.add()
                 output_shape.dims.extend(input_node.shape)
+                self.transpose_shape(output_shape.dims, [0, 3, 1, 2])
 
                 dims_arg = op.arg.add()
                 dims_arg.name = MaceKeyword.mace_dims_str
                 dims_arg.ints.extend([0, 3, 1, 2])
 
+                ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
+
             for output_node in self._option.output_nodes.values():
                 output_name = MaceKeyword.mace_output_node_name \
                               + '_' + output_node.name
@@ -673,6 +739,8 @@ class Transformer(base_converter.ConverterInterface):
                 dims_arg.name = MaceKeyword.mace_dims_str
                 dims_arg.ints.extend([0, 2, 3, 1])
 
+                ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
+
         return False
 
     def transpose_filters(self):
@@ -695,21 +763,29 @@ class Transformer(base_converter.ConverterInterface):
                         filter_data = filter_data.transpose(3, 2, 0, 1)
                         filter.float_data[:] = filter_data.flat
                         filter.dims[:] = filter_data.shape
+                if op.type == MaceOp.FullyConnected.name:
+                    weight = self._consts[op.input[1]]
+                    weight_data = np.array(weight.float_data).reshape(
+                        weight.dims)
+                    weight_data = weight_data.transpose(1, 0)
+                    weight.float_data[:] = weight_data.flat
+                    weight.dims[:] = weight_data.shape
             self.set_filter_format(FilterFormat.OIHW)
 
         return False
 
     def reshape_fc_weight(self):
+        print("Reshape fully connected weight shape")
         net = self._model
         for op in net.op:
             if op.type == MaceOp.FullyConnected.name:
                 weight = self._consts[op.input[1]]
-                # NCHW
-                input_shape = list(self._producer[op.input[0]]
-                                   .output_shape[0].dims)
-                weight_shape = [weight.dims[0]] + input_shape[1:]
-                del weight.dims[:]
-                weight.dims.extend(weight_shape)
+                input_op = self._producer[op.input[0]]
+                input_shape = list(input_op.output_shape[0].dims)
+                input_data_format = ConverterUtil.data_format(input_op)
+                weight.dims[:] = [weight.dims[0]] + input_shape[1:]
+                if input_data_format == DataFormat.NHWC:
+                    self.transpose_shape(weight.dims, [0, 3, 1, 2])
 
         return False
 
@@ -788,6 +864,8 @@ class Transformer(base_converter.ConverterInterface):
             arg.name = MaceKeyword.mace_buffer_type
             arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value
 
+            ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC)
+
         for output_node in self._option.output_nodes.values():
             output_name = MaceKeyword.mace_output_node_name \
                           + '_' + output_node.name
@@ -803,14 +881,16 @@ class Transformer(base_converter.ConverterInterface):
             arg.name = MaceKeyword.mace_buffer_type
             arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value
 
+            ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC)
+
         return False
 
-    def fold_softmax(self):
+    def fold_reshape(self):
         changed = False
         net = self._model
         for op in net.op:
-            if op.type == MaceOp.Softmax.name:
-                print("Fold softmax: %s(%s)" % (op.name, op.type))
+            if op.type == MaceOp.Softmax.name or op.type == MaceOp.MatMul.name:
+                print("Fold reshape: %s(%s)" % (op.name, op.type))
                 if self.consumer_count(op.output[0]) == 1:
                     consumer = self._consumers[op.output[0]][0]
                     if consumer.type == MaceOp.Reshape.name:
@@ -818,15 +898,14 @@ class Transformer(base_converter.ConverterInterface):
                                                       MaceKeyword.mace_shape_str).ints  # noqa
                         del op.output_shape[0].dims[:]
                         op.output_shape[0].dims.extend(shape)
-                        self.replace_output_node(consumer)
-                        net.op.remove(consumer)
+                        self.safe_remove_node(consumer, op)
                         changed = True
 
                     producer = self._producer[op.input[0]]
                     if producer.type == MaceOp.Reshape.name:
-                        op.input[0] = producer.input[0]
-                        self.replace_output_node(producer)
-                        net.op.remove(producer)
+                        self.safe_remove_node(producer,
+                                              self._producer[
+                                                  producer.input[0]])
                         changed = True
 
                 if len(op.output_shape[0].dims) < 4:
@@ -839,6 +918,20 @@ class Transformer(base_converter.ConverterInterface):
 
         return False
 
+    def transform_matmul_to_fc(self):
+        net = self._model
+        for op in net.op:
+            if op.type == MaceOp.MatMul.name:
+                input_shape = self.get_tensor_shape(op.input[0])
+                _, h, w, _ = self.sort_feature_map_shape(input_shape,
+                                                         ConverterUtil.data_format(self._producer[op.input[0]]))  # noqa
+                if h == 1 and w == 1 and op.input[1] in self._consts:
+                    weight = self._consts[op.input[1]]
+                    if len(weight.dims) == 2:
+                        op.type = MaceOp.FullyConnected.name
+
+        return False
+
     def transform_global_conv_to_fc(self):
         """Transform global conv to fc should be placed after transposing
         input/output and filter"""
@@ -917,4 +1010,8 @@ class Transformer(base_converter.ConverterInterface):
 
         del net.op[:]
         net.op.extend(sorted_nodes)
+
+        print("Final ops:")
+        for op in net.op:
+            print("%s (%s)" % (op.name, op.type))
         return False
diff --git a/mace/python/tools/mace_engine_factory.h.jinja2 b/mace/python/tools/mace_engine_factory.h.jinja2
index d4eb7c309e43d8613200e52e44f965dbd31908fc..2e923d6816725b17ffd8f236083595692f278f72 100644
--- a/mace/python/tools/mace_engine_factory.h.jinja2
+++ b/mace/python/tools/mace_engine_factory.h.jinja2
@@ -64,26 +64,25 @@ MaceStatus CreateMaceEngineFromCode(
   }
   const unsigned char * model_data = nullptr;
   NetDef net_def;
+  MaceStatus status = MaceStatus::MACE_SUCCESS;
   switch (model_name_map[model_name]) {
 {% for i in range(model_tags |length) %}
    case {{ i }}:
     model_data =
         mace::{{model_tags[i]}}::LoadModelData(model_data_file);
     net_def = mace::{{model_tags[i]}}::CreateNet();
-
-    engine->reset(
-        new mace::MaceEngine(&net_def, device_type, input_nodes, output_nodes,
-                             model_data));
+    engine->reset(new mace::MaceEngine(device_type));
+    status = (*engine)->Init(&net_def, input_nodes, output_nodes, model_data);
     if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
       mace::{{model_tags[i]}}::UnloadModelData(model_data);
     }
     break;
 {% endfor %}
    default:
-     return MaceStatus::MACE_INVALID_ARGS;
+     status = MaceStatus::MACE_INVALID_ARGS;
   }
 
-  return MaceStatus::MACE_SUCCESS;
+  return status;
 }
 {% else %}
 MaceStatus CreateMaceEngineFromCode(
diff --git a/mace/python/tools/tf_dsp_converter_lib.py b/mace/python/tools/tf_dsp_converter_lib.py
index 65369b4625a9ead2c176921d661844cbb731a3d8..493544da30cd95c0a10940db1f4b850f5988d824 100644
--- a/mace/python/tools/tf_dsp_converter_lib.py
+++ b/mace/python/tools/tf_dsp_converter_lib.py
@@ -95,12 +95,19 @@ def add_shape_const_node(net_def, op, values, name):
 
 
 def convert_op_outputs(mace_op_def, tf_op):
+    mace_op_def.out_max_byte_size.extend(
+        [max_elem_size(output) for output in tf_op.outputs])
     mace_op_def.output_type.extend(
         [tf_dtype_2_mace_dtype(output.dtype) for output in tf_op.outputs])
     output_shapes = []
     for output in tf_op.outputs:
         output_shape = mace_pb2.OutputShape()
-        output_shape.dims.extend(output.shape.as_list())
+        shape_list = output.shape.as_list()
+        if not shape_list:
+            shape_list = [1]
+        elif len(shape_list) == 2:
+            shape_list = [1, 1, shape_list[0], shape_list[1]]
+        output_shape.dims.extend(shape_list)
         output_shapes.append(output_shape)
     mace_op_def.output_shape.extend(output_shapes)
 
@@ -159,8 +166,6 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
             op_def.input.append(input_tensor.name)
             op_def.input.extend([t.name for t in s2b_op.inputs[1:]])
             op_def.input.extend([min_tensor.name, max_tensor.name])
-            op_def.out_max_byte_size.extend(
-                [max_elem_size(out) for out in quantize_op.outputs])
             convert_op_outputs(op_def, quantize_op)
         elif len(first_op.outputs) > 0 and \
             first_op.type == 'QuantizedReshape' and \
@@ -193,9 +198,71 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
             op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax')
             op_def.input.extend(
                 [input_tensor.name, min_tensor.name, max_tensor.name])
-            op_def.out_max_byte_size.extend(
-                [max_elem_size(out) for out in quantize_reshape_op.outputs])
             convert_op_outputs(op_def, quantize_reshape_op)
+        # remove Squeeze
+        elif len(first_op.outputs) > 0 and \
+            first_op.type == 'Requantize' and \
+            len(first_op.outputs[0].consumers()) > 0 and \
+            first_op.outputs[0].consumers()[0].type == 'Dequantize' and \
+            len(first_op.outputs[0].consumers()[0].outputs[0].consumers()) \
+            > 0 and \
+            first_op.outputs[0].consumers()[0].outputs[0].consumers()[0].type \
+                == 'Squeeze':
+            dequantize_op = first_op.outputs[0].consumers()[0]
+            squeeze_op = dequantize_op.outputs[0].consumers()[0]
+            reshape_op = squeeze_op.outputs[0].consumers()[0]
+            min_op = reshape_op.outputs[0].consumers()[0]
+            max_op = reshape_op.outputs[0].consumers()[1]
+            quantize_op = min_op.outputs[0].consumers()[0]
+
+            resolved_ops.add(dequantize_op.name)
+            resolved_ops.add(squeeze_op.name)
+            resolved_ops.add(reshape_op.name)
+            resolved_ops.add(min_op.name)
+            resolved_ops.add(max_op.name)
+            resolved_ops.add(quantize_op.name)
+
+            op_def.name = quantize_op.name
+            op_def.input.extend([t.name for t in first_op.inputs])
+            convert_op_outputs(op_def, quantize_op)
+
+            # Squeeze -> Softmax
+            next_op = quantize_op.outputs[0].consumers()[0] \
+                if len(quantize_op.outputs) > 0 else None
+            dequantize_op = next_op.outputs[0].consumers()[0] \
+                if next_op and len(next_op.outputs) > 0 and \
+                next_op.type == 'QuantizedReshape' and \
+                len(next_op.outputs[0].consumers()) > 0 else None
+            softmax_op = dequantize_op.outputs[0].consumers()[0]\
+                if dequantize_op and len(dequantize_op.outputs) > 0 and \
+                dequantize_op.type == 'Dequantize' and \
+                len(dequantize_op.outputs[0].consumers()) > 0 else None
+            if softmax_op and softmax_op.type == 'Softmax':
+                reshape_op = softmax_op.outputs[0].consumers()[0]
+                min_op = reshape_op.outputs[0].consumers()[0]
+                max_op = reshape_op.outputs[0].consumers()[1]
+                quantize_op = min_op.outputs[0].consumers()[0]
+                quantize_reshape_op = quantize_op.outputs[0].consumers()[0]
+
+                resolved_ops.add(next_op.name)
+                resolved_ops.add(dequantize_op.name)
+                resolved_ops.add(softmax_op.name)
+                resolved_ops.add(reshape_op.name)
+                resolved_ops.add(min_op.name)
+                resolved_ops.add(max_op.name)
+                resolved_ops.add(quantize_op.name)
+                resolved_ops.add(quantize_reshape_op.name)
+
+                softmax_op_def = net_def.op.add()
+                softmax_op_def.padding = padding_mode['NA']
+                softmax_op_def.name = quantize_reshape_op.name
+                softmax_op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax')
+                softmax_op_def.input.extend([
+                    get_tensor_name_from_op(op_def.name, 0),
+                    get_tensor_name_from_op(op_def.name, 1),
+                    get_tensor_name_from_op(op_def.name, 2)])
+                convert_op_outputs(softmax_op_def, quantize_reshape_op)
+
         elif len(first_op.outputs) > 0 and first_op.type == 'Dequantize' and \
             len(first_op.outputs[0].consumers()) > 0 and \
                 first_op.outputs[0].consumers()[0].type == 'Tanh':
@@ -220,8 +287,6 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
                 op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type)
                 op_def.input.extend(
                     [input_tensor.name, min_tensor.name, max_tensor.name])
-                op_def.out_max_byte_size.extend(
-                    [max_elem_size(out) for out in quantize_op.outputs])
                 convert_op_outputs(op_def, quantize_op)
             # tanh is last op
             else:
@@ -251,8 +316,6 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
                     get_tensor_name_from_op(op_def.name, 1),
                     get_tensor_name_from_op(op_def.name, 2)
                 ])
-                new_tanh_op_def.out_max_byte_size.extend(
-                    [max_elem_size(tanh_op.outputs[0])])
                 convert_op_outputs(new_tanh_op_def, tanh_op)
         elif has_padding_and_strides(first_op):
             op_def.padding = padding_mode[first_op.get_attr('padding')]
@@ -266,19 +329,13 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
             strides_tensor = add_shape_const_node(net_def, first_op, strides,
                                                   'strides')
             op_def.input.extend([strides_tensor])
-            op_def.out_max_byte_size.extend(
-                [max_elem_size(out) for out in first_op.outputs])
             convert_op_outputs(op_def, first_op)
         elif is_node_flatten_reshape(first_op):
             op_def.type = 'Flatten'
-            op_def.input.extend([t.name for t in first_op.inputs])
-            op_def.out_max_byte_size.extend(
-                [max_elem_size(out) for out in first_op.outputs])
+            op_def.input.extend([first_op.inputs[0].name])
             convert_op_outputs(op_def, first_op)
         elif dsp_ops.has_op(first_op.type):
             op_def.input.extend([t.name for t in first_op.inputs])
-            op_def.out_max_byte_size.extend(
-                [max_elem_size(out) for out in first_op.outputs])
             convert_op_outputs(op_def, first_op)
         else:
             raise Exception('Unsupported op: ', first_op)
@@ -478,7 +535,8 @@ def fuse_quantize(net_def, input_node, output_node):
                 skip_ops = skip_ops.union(
                     [flatten_op.name, minf_op.name, maxf_op.name])
                 skip_tensors = skip_tensors.union(
-                    [flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
+                    [minf_op.input[0], maxf_op.input[0],
+                     quantize_op.input[1], quantize_op.input[2]])
                 quantize_op.type = 'AutoQuantize'
                 del quantize_op.input[1:]
 
diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc
index 339aa2f870c5a8c3f8a2942f87d6db2cb04277e2..5d2b018c4704ed6761947b93c161f9baecb05168 100644
--- a/mace/test/mace_api_mt_test.cc
+++ b/mace/test/mace_api_mt_test.cc
@@ -318,8 +318,10 @@ void MaceRunFunc(const int in_out_size) {
       new FileStorageFactory(file_path));
   mace::SetKVStorageFactory(storage_factory);
 
-  MaceEngine engine(&net_def, device, input_names, output_names,
-                    reinterpret_cast<unsigned char *>(data.data()));
+  MaceEngine engine(device);
+  MaceStatus status = engine.Init(&net_def, input_names, output_names,
+      reinterpret_cast<unsigned char *>(data.data()));
+  ASSERT_EQ(status, MaceStatus::MACE_SUCCESS);
 
   std::map<std::string, mace::MaceTensor> inputs;
   std::map<std::string, mace::MaceTensor> outputs;
diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc
index fa1d0e31cc32f75c2cdabd1b42118ddbd11cfe4b..df8270cba9896903496225bfdba19b8d611be589 100644
--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
@@ -323,8 +323,10 @@ void MaceRun(const int in_out_size,
                          &net_def);
   }
 
-  MaceEngine engine(&net_def, device, input_names, output_names,
-                    reinterpret_cast<unsigned char *>(data.data()));
+  MaceEngine engine(device);
+  MaceStatus status = engine.Init(&net_def, input_names, output_names,
+      reinterpret_cast<unsigned char *>(data.data()));
+  ASSERT_EQ(status, MaceStatus::MACE_SUCCESS);
 
   std::map<std::string, mace::MaceTensor> inputs;
   std::map<std::string, mace::MaceTensor> outputs;
diff --git a/mace/utils/logging.cc b/mace/utils/logging.cc
index 52ddca5f2021381dc79d4ce81ff2dbfa3040d470..a8b06e698a1772918467f74101de2921d25e8cb3 100644
--- a/mace/utils/logging.cc
+++ b/mace/utils/logging.cc
@@ -15,6 +15,7 @@
 #include "mace/utils/logging.h"
 
 #include <stdlib.h>
+#include <string.h>
 #if defined(ANDROID) || defined(__ANDROID__)
 #include <android/log.h>
 #include <iostream>