diff --git a/docker/Dockerfile b/docker/Dockerfile
index 32d0cef583f71ff0a19593f3795b441c9103df99..8a8c089c493baa0982fbd68e8fa815ed68ea3e45 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -110,7 +110,8 @@ RUN apt-get install -y --no-install-recommends \

 # Install tools
 RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com setuptools
-RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.6.0 \
+RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.7.0 \
+    numpy>=1.14.0 \
     scipy \
     jinja2 \
     pyyaml \
diff --git a/docs/getting_started/how_to_build.rst b/docs/getting_started/how_to_build.rst
index 7f2c0d3a3e7030c78dc523b7f7812d0c9360a4d9..43178cf946ac54bc81689bc6e90d993bcc02fc91 100644
--- a/docs/getting_started/how_to_build.rst
+++ b/docs/getting_started/how_to_build.rst
@@ -33,11 +33,13 @@ How to build
 +=====================+=================+===================================+
 | bazel               | >= 0.5.4        | -                                 |
 +---------------------+-----------------+-----------------------------------+
-| android-ndk         | r12c            | -                                 |
+| android-ndk         | r15c,r16b       | -                                 |
 +---------------------+-----------------+-----------------------------------+
 | adb                 | >= 1.0.32       | apt install -y android-tools-adb  |
 +---------------------+-----------------+-----------------------------------+
-| tensorflow          | 1.4.0           | pip install tensorflow==1.4.0     |
+| tensorflow          | 1.7.0           | pip install tensorflow==1.7.0     |
++---------------------+-----------------+-----------------------------------+
+| numpy               | >= 1.14.0       | pip install numpy                 |
 +---------------------+-----------------+-----------------------------------+
 | scipy               | >= 1.0.0        | pip install scipy                 |
 +---------------------+-----------------+-----------------------------------+
@@ -45,6 +47,10 @@ How to build
 +---------------------+-----------------+-----------------------------------+
 | PyYaml              | >= 3.12         | pip install pyyaml                |
 +---------------------+-----------------+-----------------------------------+
+| sh                  | >= 1.12.14      | pip install sh                    |
++---------------------+-----------------+-----------------------------------+
+| filelock            | >= 3.0.0        | pip install filelock              |
++---------------------+-----------------+-----------------------------------+
 | docker(for caffe)   | >= 17.09.0-ce   | `install doc `__                  |
 +---------------------+-----------------+-----------------------------------+

@@ -229,29 +235,47 @@ Caffe: only the latest version is supported; use Caffe's own tools to upgrade old models
     // Include the headers
     #include "mace/public/mace.h"
-    #include "mace/public/{MODEL_TAG}.h"
+    #include "mace/public/mace_engine_factory.h"

-    // 0. Set up internal storage
+    // 0. Set up internal storage (only needs to be set once)
     const std::string file_path ="/path/to/store/internel/files";
     std::shared_ptr storage_factory(
         new FileStorageFactory(file_path));
     ConfigKVStorageFactory(storage_factory);

-    //1. Load model data from a file or from code; a custom loader (e.g. your own compression/encryption) can also be used
-    //   If the model data is embedded in the code, set the argument to nullptr.
-    unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
+    //1. Declare the device type (must match the runtime specified at build time)
+    DeviceType device_type = DeviceType::GPU;

-    //2. Create the net object
-    NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
-
-    //3. Declare the device type (must match the runtime specified at build time)
-    DeviceType device_type = DeviceType::OPENCL;
-
-    //4. Define the arrays of input and output names
+    //2. Define the arrays of input and output names
     std::vector input_names = {...};
     std::vector output_names = {...};

-    //5. Create the input and output objects
+    //3. Create the MaceEngine object
+    std::shared_ptr engine;
+    MaceStatus create_engine_status;
+    // Create Engine
+    if (model_data_file.empty()) {
+      create_engine_status =
+          CreateMaceEngine(model_name.c_str(),
+                           nullptr,
+                           input_names,
+                           output_names,
+                           device_type,
+                           &engine);
+    } else {
+      create_engine_status =
+          CreateMaceEngine(model_name.c_str(),
+                           model_data_file.c_str(),
+                           input_names,
+                           output_names,
+                           device_type,
+                           &engine);
+    }
+    if (create_engine_status != MaceStatus::MACE_SUCCESS) {
+      // do something
+    }
+
+    //4. Create the input and output objects
     std::map inputs;
     std::map outputs;

     for (size_t i = 0; i < input_count; ++i) {
@@ -276,14 +300,6 @@ Caffe: only the latest version is supported; use Caffe's own tools to upgrade old models
       outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
     }

-    //6. Create the MaceEngine object
-    mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
-
-    //7. If the device type is OPENCL or HEXAGON, model_data can be released here
-    if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
-      mace::MACE_MODEL_TAG::UnloadModelData(model_data);
-    }
-
-    //8. Run the model and get the results
+    //5. Run the model and get the results
     engine.Run(inputs, &outputs);

diff --git a/mace/core/allocator.h b/mace/core/allocator.h
index 8c73025b4923cd860f3a47f3109cc4325728b259..7ab701ddd21b15b0bb88b258d9b2f85801b8dda2 100644
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -16,6 +16,7 @@
 #define MACE_CORE_ALLOCATOR_H_

 #include
+#include
 #include
 #include
 #include
@@ -42,9 +43,10 @@ class Allocator {
  public:
   Allocator() {}
   virtual ~Allocator() noexcept {}
-  virtual void *New(size_t nbytes) const = 0;
-  virtual void *NewImage(const std::vector &image_shape,
-                         const DataType dt) const = 0;
+  virtual MaceStatus New(size_t nbytes, void **result) const = 0;
+  virtual MaceStatus NewImage(const std::vector &image_shape,
+                              const DataType dt,
+                              void **result) const = 0;
   virtual void Delete(void *data) const = 0;
   virtual void DeleteImage(void *data) const = 0;
   virtual void *Map(void *buffer, size_t offset, size_t nbytes) const = 0;
@@ -53,44 +55,54 @@ class Allocator {
                     std::vector *mapped_image_pitch) const = 0;
   virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
   virtual bool OnHost() const = 0;
-
-  template
-  T *New(size_t num_elements) {
-    if (num_elements > (std::numeric_limits::max() / sizeof(T))) {
-      return nullptr;
-    }
-    void *p = New(sizeof(T) * num_elements);
-    T *typed_p = reinterpret_cast(p);
-    return typed_p;
-  }
 };

 class CPUAllocator : public Allocator {
  public:
   ~CPUAllocator() override {}
-  void *New(size_t nbytes) const override {
+  MaceStatus New(size_t nbytes, void **result) const override {
     VLOG(3) << "Allocate CPU buffer: " << nbytes;
+    if (nbytes == 0) {
+      return MaceStatus::MACE_SUCCESS;
+    }
     void *data = nullptr;
 #if defined(__ANDROID__) || defined(__hexagon__)
     data = memalign(kMaceAlignment, nbytes);
+    if (data == NULL) {
+      LOG(WARNING) << "Allocate CPU Buffer with "
+                   << nbytes << " bytes failed because of"
+                   << strerror(errno);
+      *result = nullptr;
+      return MaceStatus::MACE_OUT_OF_RESOURCES;
+    }
 #else
-    MACE_CHECK(posix_memalign(&data, kMaceAlignment, nbytes) == 0);
+    int ret = posix_memalign(&data, kMaceAlignment, nbytes);
+    if (ret != 0) {
+      LOG(WARNING) << "Allocate CPU Buffer with "
+                   << nbytes << " bytes failed because of"
+                   << strerror(errno);
+      *result = nullptr;
+      return MaceStatus::MACE_OUT_OF_RESOURCES;
+    }
 #endif
-    MACE_CHECK_NOTNULL(data);
     // TODO(heliangliang) This should be avoided sometimes
     memset(data, 0, nbytes);
-    return data;
+    *result = data;
+    return MaceStatus::MACE_SUCCESS;
   }

-  void *NewImage(const std::vector &shape,
-                 const DataType dt) const override {
+  MaceStatus NewImage(const std::vector &shape,
+                      const DataType dt,
+                      void **result) const override {
     MACE_UNUSED(shape);
     MACE_UNUSED(dt);
+    MACE_UNUSED(result);
     LOG(FATAL) << "Allocate CPU image";
-    return nullptr;
+    return MaceStatus::MACE_SUCCESS;
   }
   void Delete(void *data) const override {
+    MACE_CHECK_NOTNULL(data);
     VLOG(3) << "Free CPU buffer";
     free(data);
   }
diff --git a/mace/core/buffer.h b/mace/core/buffer.h
index f4b252a776296b1e065816c3a9b6288d13d03837..afac3f296b1bb44798a1261f639bb612336317ae 100644
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -38,6 +38,11 @@ class BufferBase {

   virtual void *raw_mutable_data() = 0;

+  virtual MaceStatus Allocate(index_t nbytes) = 0;
+
+  virtual MaceStatus Allocate(const std::vector &shape,
+                              DataType data_type) = 0;
+
   virtual void *Map(index_t offset,
                     index_t length,
                     std::vector *pitch) const = 0;
@@ -48,7 +53,7 @@ class BufferBase {

   virtual void UnMap() = 0;

-  virtual void Resize(index_t size) = 0;
+  virtual MaceStatus Resize(index_t nbytes) = 0;

   virtual void
Copy(void *src, index_t offset, index_t length) = 0; @@ -83,14 +88,6 @@ class Buffer : public BufferBase { mapped_buf_(nullptr), is_data_owner_(true) {} - Buffer(Allocator *allocator, index_t size) - : BufferBase(size), - allocator_(allocator), - mapped_buf_(nullptr), - is_data_owner_(true) { - buf_ = allocator->New(size); - } - Buffer(Allocator *allocator, void *data, index_t size) : BufferBase(size), allocator_(allocator), @@ -132,6 +129,31 @@ class Buffer : public BufferBase { } } + MaceStatus Allocate(index_t nbytes) { + if (nbytes <= 0) { + return MaceStatus::MACE_SUCCESS; + } + MACE_CHECK(is_data_owner_, + "data is not owned by this buffer, cannot reallocate"); + if (mapped_buf_ != nullptr) { + UnMap(); + } + if (buf_ != nullptr) { + allocator_->Delete(buf_); + } + size_ = nbytes; + return allocator_->New(nbytes, &buf_); + } + + MaceStatus Allocate(const std::vector &shape, + DataType data_type) { + if (shape.empty()) return MaceStatus::MACE_SUCCESS; + index_t nbytes = std::accumulate(shape.begin(), shape.end(), + 1, std::multiplies()) + * GetEnumTypeSize(data_type); + return this->Allocate(nbytes); + } + void *Map(index_t offset, index_t length, std::vector *pitch) const { MACE_CHECK_NOTNULL(buf_); MACE_UNUSED(pitch); @@ -154,16 +176,17 @@ class Buffer : public BufferBase { mapped_buf_ = nullptr; } - void Resize(index_t size) { + MaceStatus Resize(index_t nbytes) { MACE_CHECK(is_data_owner_, "data is not owned by this buffer, cannot resize"); - if (size != size_) { + if (nbytes != size_) { if (buf_ != nullptr) { allocator_->Delete(buf_); } - size_ = size; - buf_ = allocator_->New(size); + size_ = nbytes; + return allocator_->New(nbytes, &buf_); } + return MaceStatus::MACE_SUCCESS; } void Copy(void *src, index_t offset, index_t length) { @@ -195,18 +218,6 @@ class Image : public BufferBase { buf_(nullptr), mapped_buf_(nullptr) {} - Image(std::vector shape, DataType data_type) - : BufferBase( - std::accumulate( - shape.begin(), shape.end(), 1, std::multiplies()) * - GetEnumTypeSize(data_type)), - allocator_(GetDeviceAllocator(GPU)), - mapped_buf_(nullptr) { - shape_ = shape; - data_type_ = data_type; - buf_ = allocator_->NewImage(shape, data_type); - } - virtual ~Image() { if (mapped_buf_ != nullptr) { UnMap(); @@ -233,6 +244,29 @@ class Image : public BufferBase { std::vector image_shape() const { return shape_; } + MaceStatus Allocate(index_t nbytes) { + MACE_UNUSED(nbytes); + LOG(FATAL) << "Image should not call this allocate function"; + return MaceStatus::MACE_SUCCESS; + } + + MaceStatus Allocate(const std::vector &shape, + DataType data_type) { + index_t size = std::accumulate( + shape.begin(), shape.end(), 1, std::multiplies()) * + GetEnumTypeSize(data_type); + if (mapped_buf_ != nullptr) { + UnMap(); + } + if (buf_ != nullptr) { + allocator_->DeleteImage(buf_); + } + size_ = size; + shape_ = shape; + data_type_ = data_type; + return allocator_->NewImage(shape, data_type, &buf_); + } + void *Map(index_t offset, index_t length, std::vector *pitch) const { MACE_UNUSED(offset); MACE_UNUSED(length); @@ -259,9 +293,10 @@ class Image : public BufferBase { mapped_buf_ = nullptr; } - void Resize(index_t size) { + MaceStatus Resize(index_t size) { MACE_UNUSED(size); MACE_NOT_IMPLEMENTED; + return MaceStatus::MACE_SUCCESS; } void Copy(void *src, index_t offset, index_t length) { @@ -339,6 +374,20 @@ class BufferSlice : public BufferBase { } } + MaceStatus Allocate(index_t size) { + MACE_UNUSED(size); + LOG(FATAL) << "BufferSlice should not call allocate function"; + return 
MaceStatus::MACE_SUCCESS; + } + + MaceStatus Allocate(const std::vector &shape, + DataType data_type) { + MACE_UNUSED(shape); + MACE_UNUSED(data_type); + LOG(FATAL) << "BufferSlice should not call allocate function"; + return MaceStatus::MACE_SUCCESS; + } + void *Map(index_t offset, index_t length, std::vector *pitch) const { MACE_UNUSED(offset); MACE_UNUSED(length); @@ -364,9 +413,10 @@ class BufferSlice : public BufferBase { mapped_buf_ = nullptr; } - void Resize(index_t size) { + MaceStatus Resize(index_t size) { MACE_CHECK(size == size_, "resize buffer slice from ", size_, " to ", size, " is illegal"); + return MaceStatus::MACE_SUCCESS; } void Copy(void *src, index_t offset, index_t length) { @@ -396,20 +446,17 @@ class ScratchBuffer: public Buffer { : Buffer(allocator), offset_(0) {} - ScratchBuffer(Allocator *allocator, index_t size) - : Buffer(allocator, size), - offset_(0) {} - ScratchBuffer(Allocator *allocator, void *data, index_t size) : Buffer(allocator, data, size), offset_(0) {} virtual ~ScratchBuffer() {} - void GrowSize(index_t size) { + MaceStatus GrowSize(index_t size) { if (size > size_) { - Resize(size); + return Resize(size); } + return MaceStatus::MACE_SUCCESS; } BufferSlice Scratch(index_t size) { diff --git a/mace/core/mace.cc b/mace/core/mace.cc index dc9cbaa90ed9e96a4f24fd813caa86d0b0fb64fa..83ab4bd19ae48debffd5585fe9035a150ac24dac 100644 --- a/mace/core/mace.cc +++ b/mace/core/mace.cc @@ -82,12 +82,14 @@ std::shared_ptr MaceTensor::data() { return impl_->data; } // Mace Engine class MaceEngine::Impl { public: - explicit Impl(const NetDef *net_def, - DeviceType device_type, - const std::vector &input_nodes, - const std::vector &output_nodes); + explicit Impl(DeviceType device_type); + ~Impl(); + MaceStatus Init(const NetDef *net_def, + const std::vector &input_nodes, + const std::vector &output_nodes); + MaceStatus Run(const std::map &inputs, std::map *outputs, RunMetadata *run_metadata); @@ -104,10 +106,8 @@ class MaceEngine::Impl { DISABLE_COPY_AND_ASSIGN(Impl); }; -MaceEngine::Impl::Impl(const NetDef *net_def, - DeviceType device_type, - const std::vector &input_nodes, - const std::vector &output_nodes) + +MaceEngine::Impl::Impl(DeviceType device_type) : op_registry_(new OperatorRegistry()), device_type_(device_type), ws_(new Workspace()), @@ -115,7 +115,12 @@ MaceEngine::Impl::Impl(const NetDef *net_def, #ifdef MACE_ENABLE_HEXAGON , hexagon_controller_(nullptr) #endif -{ +{} + +MaceStatus MaceEngine::Impl::Init( + const NetDef *net_def, + const std::vector &input_nodes, + const std::vector &output_nodes) { LOG(INFO) << "MACE version: " << MaceVersion(); // Set storage path for internal usage for (auto input_name : input_nodes) { @@ -127,7 +132,7 @@ MaceEngine::Impl::Impl(const NetDef *net_def, GetDeviceAllocator(device_type_), DT_FLOAT); } #ifdef MACE_ENABLE_HEXAGON - if (device_type == HEXAGON) { + if (device_type_ == HEXAGON) { hexagon_controller_.reset(new HexagonControlWrapper()); MACE_CHECK(hexagon_controller_->Config(), "hexagon config error"); MACE_CHECK(hexagon_controller_->Init(), "hexagon init error"); @@ -143,18 +148,22 @@ MaceEngine::Impl::Impl(const NetDef *net_def, } } else { #endif - ws_->LoadModelTensor(*net_def, device_type); + MaceStatus status = ws_->LoadModelTensor(*net_def, device_type_); + if (status != MaceStatus::MACE_SUCCESS) { + return status; + } - // Init model - auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type, + // Init model + auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_, NetMode::INIT); 
if (!net->Run()) { LOG(FATAL) << "Net init run failed"; } - net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type); + net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_); #ifdef MACE_ENABLE_HEXAGON } #endif + return MaceStatus::MACE_SUCCESS; } MaceEngine::Impl::~Impl() { @@ -244,16 +253,17 @@ MaceStatus MaceEngine::Impl::Run( return MACE_SUCCESS; } -MaceEngine::MaceEngine(const NetDef *net_def, - DeviceType device_type, - const std::vector &input_nodes, - const std::vector &output_nodes) { - impl_ = std::unique_ptr( - new MaceEngine::Impl(net_def, device_type, input_nodes, output_nodes)); -} +MaceEngine::MaceEngine(DeviceType device_type): + impl_(new MaceEngine::Impl(device_type)) {} MaceEngine::~MaceEngine() = default; +MaceStatus MaceEngine::Init(const NetDef *net_def, + const std::vector &input_nodes, + const std::vector &output_nodes) { + return impl_->Init(net_def, input_nodes, output_nodes); +} + MaceStatus MaceEngine::Run(const std::map &inputs, std::map *outputs, RunMetadata *run_metadata) { diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index 9b94eb92ec15eb187d76fdf429e14712995baf09..96b083cfcbd0fe9b5fea8cbd2862a69e0700fec5 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -44,18 +44,30 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) { OpenCLAllocator::OpenCLAllocator() {} OpenCLAllocator::~OpenCLAllocator() {} -void *OpenCLAllocator::New(size_t nbytes) const { +MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const { + if (nbytes == 0) { + return MaceStatus::MACE_SUCCESS; + } VLOG(3) << "Allocate OpenCL buffer: " << nbytes; cl_int error; cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nbytes, nullptr, &error); - MACE_CHECK_CL_SUCCESS(error); - return static_cast(buffer); + if (error != CL_SUCCESS) { + LOG(WARNING) << "Allocate OpenCL Buffer with " + << nbytes << " bytes failed because of" + << OpenCLErrorToString(error); + *result = nullptr; + return MaceStatus::MACE_OUT_OF_RESOURCES; + } else { + *result = buffer; + return MaceStatus::MACE_SUCCESS; + } } -void *OpenCLAllocator::NewImage(const std::vector &image_shape, - const DataType dt) const { +MaceStatus OpenCLAllocator::NewImage(const std::vector &image_shape, + const DataType dt, + void **result) const { MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2"; VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1]; @@ -67,11 +79,17 @@ void *OpenCLAllocator::NewImage(const std::vector &image_shape, new cl::Image2D(OpenCLRuntime::Global()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format, image_shape[0], image_shape[1], 0, nullptr, &error); - MACE_CHECK_CL_SUCCESS(error) << " with image shape: [" - << image_shape[0] << ", " << image_shape[1] - << "]"; - - return cl_image; + if (error != CL_SUCCESS) { + LOG(WARNING) << "Allocate OpenCL image with shape: [" + << image_shape[0] << ", " << image_shape[1] + << "] failed because of" + << OpenCLErrorToString(error); + *result = nullptr; + return MaceStatus::MACE_OUT_OF_RESOURCES; + } else { + *result = cl_image; + return MaceStatus::MACE_SUCCESS; + } } void OpenCLAllocator::Delete(void *buffer) const { diff --git a/mace/core/runtime/opencl/opencl_allocator.h b/mace/core/runtime/opencl/opencl_allocator.h index 
0ec50f61062aad81e2e8b0fc010e2a57b457a833..6304add8583f7b2e47c58cd6e6b186ea43b7f092 100644 --- a/mace/core/runtime/opencl/opencl_allocator.h +++ b/mace/core/runtime/opencl/opencl_allocator.h @@ -27,15 +27,16 @@ class OpenCLAllocator : public Allocator { ~OpenCLAllocator() override; - void *New(size_t nbytes) const override; + MaceStatus New(size_t nbytes, void **result) const override; /* * Use Image2D with RGBA (128-bit) format to represent the image. * * @ shape : [depth, ..., height, width ]. */ - void *NewImage(const std::vector &image_shape, - const DataType dt) const override; + MaceStatus NewImage(const std::vector &image_shape, + const DataType dt, + void **result) const override; void Delete(void *buffer) const override; diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 87a6cb3c1c9ff9f3712c50e07c8c6e0d69f5cf61..8c9f3e3b6a139f25d1d6f7a03754bc817e245795 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -216,16 +216,19 @@ class Tensor { MACE_CHECK(raw_size() <= buffer_->size()); } - inline void Resize(const std::vector &shape) { + inline MaceStatus Resize(const std::vector &shape) { shape_ = shape; image_shape_.clear(); if (buffer_ != nullptr) { MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage."); - if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) - buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE); + if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) { + return buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE); + } + return MaceStatus::MACE_SUCCESS; } else { MACE_CHECK(is_buffer_owner_); - buffer_ = new Buffer(allocator_, raw_size() + EXTRA_BUFFER_PAD_SIZE); + buffer_ = new Buffer(allocator_); + return buffer_->Allocate(raw_size() + EXTRA_BUFFER_PAD_SIZE); } } @@ -241,13 +244,14 @@ class Tensor { is_buffer_owner_ = false; } - inline void ResizeImage(const std::vector &shape, - const std::vector &image_shape) { + inline MaceStatus ResizeImage(const std::vector &shape, + const std::vector &image_shape) { shape_ = shape; image_shape_ = image_shape; if (buffer_ == nullptr) { MACE_CHECK(is_buffer_owner_); - buffer_ = new Image(image_shape, dtype_); + buffer_ = new Image(); + return buffer_->Allocate(image_shape, dtype_); } else { MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize."); Image *image = dynamic_cast(buffer_); @@ -257,24 +261,27 @@ class Tensor { "): current physical image shape: ", image->image_shape()[0], ", ", image->image_shape()[1], " < logical image shape: ", image_shape[0], ", ", image_shape[1]); + return MaceStatus::MACE_SUCCESS; } } - inline void ResizeLike(const Tensor &other) { ResizeLike(&other); } + inline MaceStatus ResizeLike(const Tensor &other) { + return ResizeLike(&other); + } - inline void ResizeLike(const Tensor *other) { + inline MaceStatus ResizeLike(const Tensor *other) { if (other->has_opencl_image()) { if (is_buffer_owner_ && buffer_ != nullptr && !has_opencl_image()) { delete buffer_; buffer_ = nullptr; } - ResizeImage(other->shape(), other->image_shape_); + return ResizeImage(other->shape(), other->image_shape_); } else { if (is_buffer_owner_ && buffer_ != nullptr && has_opencl_image()) { delete buffer_; buffer_ = nullptr; } - Resize(other->shape()); + return Resize(other->shape()); } } diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 46ae0b48a49aa461fb0efee21826a8bd5fc443aa..1f653a458ecdf98abb9dc6441439dbda6b855a23 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -60,7 +60,7 @@ std::vector Workspace::Tensors() const { return names; } 
-void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { +MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { MACE_LATENCY_LOGGER(1, "Load model tensors"); index_t model_data_size = 0; unsigned char *model_data_ptr = nullptr; @@ -89,7 +89,11 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { model_data_size)); } else { tensor_buffer_ = std::unique_ptr( - new Buffer(GetDeviceAllocator(type), model_data_size)); + new Buffer(GetDeviceAllocator(type))); + MaceStatus status = tensor_buffer_->Allocate(model_data_size); + if (status != MaceStatus::MACE_SUCCESS) { + return status; + } tensor_buffer_->Map(nullptr); tensor_buffer_->Copy(model_data_ptr, 0, model_data_size); tensor_buffer_->UnMap(); @@ -120,14 +124,16 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { } if (type == DeviceType::CPU || type == DeviceType::GPU) { - CreateOutputTensorBuffer(net_def, type); + MaceStatus status = CreateOutputTensorBuffer(net_def, type); + if (status != MaceStatus::MACE_SUCCESS) return status; } + return MaceStatus::MACE_SUCCESS; } -void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, - DeviceType device_type) { +MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, + DeviceType device_type) { if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) { - return; + return MaceStatus::MACE_SUCCESS; } DataType dtype = DataType::DT_INVALID; @@ -157,14 +163,24 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, // TODO(liuqi): refactor based on PB if (mem_block.mem_id() >= 20000) { std::unique_ptr image_buf( - new Image({mem_block.x(), mem_block.y()}, dtype)); + new Image()); + MaceStatus status = image_buf->Allocate( + {mem_block.x(), mem_block.y()}, dtype); + if (status != MaceStatus::MACE_SUCCESS) { + return status; + } preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf)); } } else { if (mem_block.mem_id() < 20000) { std::unique_ptr tensor_buf( - new Buffer(GetDeviceAllocator(device_type), mem_block.x())); + new Buffer(GetDeviceAllocator(device_type))); + MaceStatus status = tensor_buf->Allocate( + mem_block.x() * GetEnumTypeSize(dtype)); + if (status != MaceStatus::MACE_SUCCESS) { + return status; + } preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(tensor_buf)); } @@ -201,6 +217,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, } } } + return MaceStatus::MACE_SUCCESS; } ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) { diff --git a/mace/core/workspace.h b/mace/core/workspace.h index e9e11ea38810a8fd71c9871c07cb199c803d3dd2..06236fb987fed16ba42fb3a3ce885d2ccf7dbf72 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -47,12 +47,13 @@ class Workspace { std::vector Tensors() const; - void LoadModelTensor(const NetDef &net_def, DeviceType type); + MaceStatus LoadModelTensor(const NetDef &net_def, DeviceType type); ScratchBuffer *GetScratchBuffer(DeviceType device_type); private: - void CreateOutputTensorBuffer(const NetDef &net_def, DeviceType device_type); + MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, + DeviceType device_type); TensorMap tensor_map_; diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 7f8258280728872c32f96cfc49316e69d370eb08..bf8dc7055ecb727ba2c13d8fa5b30edfded49621 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -99,7 +99,6 @@ struct Conv2dFunctor : Conv2dFunctorBase { #pragma omp parallel for 
collapse(2) for (index_t b = 0; b < in_shape[0]; b++) { for (index_t m = 0; m < filter_shape[0]; m += 4) { - const index_t in_height = in_shape[2]; const index_t in_width = in_shape[3]; const index_t out_height = out_shape[2]; const index_t out_width = out_shape[3]; @@ -322,8 +321,6 @@ struct Conv2dFunctor : Conv2dFunctorBase { index_t dilation_h = dilations_[0]; index_t dilation_w = dilations_[1]; - const index_t filter_hw[2] = {filter_h, filter_w}; - MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); index_t padded_input_height = input_height + paddings[0]; diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc index 5cee48620aa0aa6be6600bbbe331016a879c4c54..6b55696614201b24d4492275c2ae219a5038926e 100644 --- a/mace/kernels/opencl/activation.cc +++ b/mace/kernels/opencl/activation.cc @@ -45,7 +45,8 @@ void ActivationFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 4587a2cb7b2c8a5cebe0470533a1457bb6937e1a..c47213f593cce3da126555993b5b500e95019414 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -58,7 +58,8 @@ void AddNFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index f28c9ccc6cda25ec713c108bc1eae2ad3f9a38ed..80fafdbcb3f0f129f6ce97ab6cf57406cb617b60 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -56,7 +56,8 @@ void BatchNormFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index 05b9d169c966a117730f866b6d402c94524844a7..e50dcf58611360833ae2fbaeff2dc9d2d721c01b 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -49,7 +49,8 @@ void BiasAddFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 1bce914c1b817a489f444e32b5284c25a7f0d527..bf629e373b77ce57f24640c342a6ce6fe2c5ab45 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ 
b/mace/kernels/opencl/buffer_to_image.cc @@ -93,7 +93,8 @@ void BufferToImageFunctor::operator()( built_options.emplace("-DOUT_OF_RANGE_CHECK"); if (!kernel_error_) { kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 7cb082544f55c2bf72711ec0fe6ec0e8448442eb..d16a3d8a73c14df3282c940f01db5b2848a78d34 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -56,7 +56,8 @@ void ChannelShuffleFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 96c15fd8adfe4369d483cc6be424d341fe59b743..239041008bcf08cd898bbe9b7f722d68cb4afdec 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -67,7 +67,8 @@ static void Concat2(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); @@ -148,7 +149,8 @@ static void ConcatN(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc index 52ed0368fc6fbe824ee2f254394ecfbc6324071b..5b79ea662a3f0d101ca4f4e3ab0faf2ce5f2ffd3 100644 --- a/mace/kernels/opencl/conv_2d_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -100,7 +100,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc index f5600883d850e7f05052e8830b2ab79815ffd15e..5386c4173ad5441be104c7e10c183113223915ef 100644 --- a/mace/kernels/opencl/conv_2d_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -86,7 +86,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); 
(*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc index 2329984a0c53592c8d73a4b629a63167081c2a33..e44d898168fdfd096fa495d6b076a668f4f1a2fd 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -94,7 +94,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc index bbcbec6c01126095af0e706986d5c47a1ffa707a..abb4b43effff898009bdc56436f9ae44b16cc40b 100644 --- a/mace/kernels/opencl/deconv_2d_opencl.cc +++ b/mace/kernels/opencl/deconv_2d_opencl.cc @@ -65,7 +65,8 @@ void Deconv2dOpencl(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc index fd25f948c355999909bcd670e41ff249dc4e5aea..609ad20516444970013ecc5ba796eafb16c060f3 100644 --- a/mace/kernels/opencl/depth_to_space.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -86,7 +86,8 @@ void DepthToSpaceOpFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc index 43a24e662828a5282914c230553220cc2adc30c9..c7800d0ae24e5d4ed7471cc1f1ea3829ddacd9fc 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -97,7 +97,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc index 94b4c322bc625fa82f9d2e482c99e5b95fdd41d3..4f0590466b02c1a400682a89b656394bdd7318b3 100644 --- a/mace/kernels/opencl/eltwise.cc +++ b/mace/kernels/opencl/eltwise.cc @@ -97,7 +97,8 @@ void EltwiseFunctor::operator()(const Tensor *input0, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); 
*(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc index 0022b92380208a7f4eb06dff68a8ec45c18dff39..6e0678daff35642a9c673ecd1a76e4059858aa67 100644 --- a/mace/kernels/opencl/fully_connected.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -74,7 +74,8 @@ void FCWXKernel(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); @@ -200,7 +201,8 @@ void FCWTXKernel(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index 09b040dd454fbe1a2d24a85019c66116d3610e70..1cefff9e6ba56f00ed366e16aae268dcd0a78e16 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -86,7 +86,8 @@ void ImageToBufferFunctor::operator()( built_options.emplace("-DOUT_OF_RANGE_CHECK"); if (!kernel_error_) { kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 9a16694a0284f1b6583ee633487b4725283bafea..cc63ed04962938f1c4b75e57ec7a618f06fbf2aa 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -54,7 +54,8 @@ void MatMulFunctor::operator()(const Tensor *A, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index 012edd70cd9a8c73a409886d37fad0b29ef8411b..467a595309c32f26de3ae271040045092252d840 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -57,7 +57,8 @@ bool BufferToImageOpImpl(Tensor *buffer, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error->Allocate(1); kernel_error->Map(nullptr); *(kernel_error->mutable_data()) = 0; kernel_error->UnMap(); diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index bc093c16e3f2b66017fe368436b5f172bb9b3d5f..34fbf659ebab12a5258f87234df7f131701f84cf 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -60,7 +60,8 @@ void PadFunctor::operator()( if 
(runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index df2fcbe9223f902c721de2ace2aa7d5b780498c1..8a9f91e90df8c79369793b35f59a49e39af7c6e1 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -72,7 +72,8 @@ void PoolingFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc index 1c36b27e37dbb5a5649203c2d8fa37a69f5f3266..0c86cae840ac15ce4a56be3727d8d747cfe9d179 100644 --- a/mace/kernels/opencl/resize_bilinear.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -78,7 +78,8 @@ void ResizeBilinearFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 7944ee88b9845f93438d363069e9afbc4065f873..21fdbca1ecb6b2a492787ffe93601af9466bce96 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -51,7 +51,8 @@ void SliceFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc index 24329be41496ee740e34125a9cfaa08f793d5644..8e5be84509bd5ae4d49fdcfecbd0cdbd7e9f0359 100644 --- a/mace/kernels/opencl/softmax.cc +++ b/mace/kernels/opencl/softmax.cc @@ -70,7 +70,8 @@ void SoftmaxFunctor::operator()(const Tensor *logits, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc index fa4850cb95348b460fefde519c77fcd97f818504..c3c45f0b2a18bce690cad718b2d61391d243a0f4 100644 --- a/mace/kernels/opencl/space_to_batch.cc +++ b/mace/kernels/opencl/space_to_batch.cc @@ -70,7 +70,8 @@ void SpaceToBatchFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new 
Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index fcf815281ea783fc184819b8d86ac3480bcadd66..da7dea0b974c09e7cf8e7e45442ff44a95eadfe4 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -39,7 +39,8 @@ void WinogradTransformFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -138,7 +139,8 @@ void WinogradInverseTransformFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/public/mace.h b/mace/public/mace.h index 02d903fd1a7a9f40395452e40ab91ef74b5ca9be..1bc8c36a3fc0eced343ac66f26ff5eee49ab82dd 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -30,7 +30,11 @@ const char *MaceVersion(); enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 }; -enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 }; +enum MaceStatus { + MACE_SUCCESS = 0, + MACE_INVALID_ARGS = 1, + MACE_OUT_OF_RESOURCES = 2 +}; // MACE input/output tensor class MaceTensor { @@ -61,12 +65,13 @@ class RunMetadata; class MaceEngine { public: - explicit MaceEngine(const NetDef *net_def, - DeviceType device_type, - const std::vector &input_nodes, - const std::vector &output_nodes); + explicit MaceEngine(DeviceType device_type); ~MaceEngine(); + MaceStatus Init(const NetDef *net_def, + const std::vector &input_nodes, + const std::vector &output_nodes); + MaceStatus Run(const std::map &inputs, std::map *outputs); diff --git a/mace/python/tools/mace_engine_factory.h.jinja2 b/mace/python/tools/mace_engine_factory.h.jinja2 index 3b4994699ebe85e988f0909832a7e5bf37bafb09..3110ab37b3e4fa6f3a47825c604a24e4ae2101d3 100644 --- a/mace/python/tools/mace_engine_factory.h.jinja2 +++ b/mace/python/tools/mace_engine_factory.h.jinja2 @@ -61,24 +61,25 @@ MaceStatus CreateMaceEngine( } const unsigned char * model_data = nullptr; NetDef net_def; + MaceStatus status = MaceStatus::MACE_SUCCESS; switch (model_name_map[model_name]) { {% for i in range(model_tags |length) %} case {{ i }}: model_data = mace::{{model_tags[i]}}::LoadModelData(model_data_file); net_def = mace::{{model_tags[i]}}::CreateNet(model_data); - engine->reset( - new mace::MaceEngine(&net_def, device_type, input_nodes, output_nodes)); + engine->reset(new mace::MaceEngine(device_type)); + status = (*engine)->Init(&net_def, input_nodes, output_nodes); if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) { mace::{{model_tags[i]}}::UnloadModelData(model_data); } break; {% endfor %} default: - return MaceStatus::MACE_INVALID_ARGS; + status = MaceStatus::MACE_INVALID_ARGS; } - return MaceStatus::MACE_SUCCESS; + return status; } } // namespace mace diff --git 
a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index a1271b28b433c46b5caf5052d3db9562b032dcaf..a4088930dcc57ab1f2ed4976195519747a1810b8 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -304,7 +304,9 @@ void MaceRunFunc(const int in_out_size) { new FileStorageFactory(file_path)); mace::SetKVStorageFactory(storage_factory); - MaceEngine engine(&net_def, device, input_names, output_names); + MaceEngine engine(device); + MaceStatus status = engine.Init(&net_def, input_names, output_names); + ASSERT_EQ(status, MaceStatus::MACE_SUCCESS); std::map inputs; std::map outputs; diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 776fa6744c231aadff21aa592b52e43900f423b5..0b16c762da94f2379fc2eaf4d50eafd4d9873cc0 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -308,7 +308,9 @@ void MaceRun(const int in_out_size, &net_def); } - MaceEngine engine(&net_def, device, input_names, output_names); + MaceEngine engine(device); + MaceStatus status = engine.Init(&net_def, input_names, output_names); + ASSERT_EQ(status, MaceStatus::MACE_SUCCESS); std::map inputs; std::map outputs;
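For reference, the end-to-end calling pattern this patch moves API users to (construct MaceEngine with a DeviceType, call the fallible Init(), and check MaceStatus, as the updated docs and mace/test/mace_api_test.cc above do) looks roughly like the sketch below. It is illustrative only and not part of the patch: RunModelOnce, the tensor names, and the shapes are made-up placeholders, and in a real application the NetDef and model data come from the generated model code (mace::MACE_MODEL_TAG::CreateNet) or from CreateMaceEngine in mace_engine_factory.h.

// Illustrative sketch only (not part of the patch). Assumes the public API in
// mace/public/mace.h after this change; names and shapes are placeholders.
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "mace/public/mace.h"

int RunModelOnce(const mace::NetDef *net_def) {
  const std::vector<std::string> input_names = {"input"};    // placeholder
  const std::vector<std::string> output_names = {"output"};  // placeholder

  // Construction is now cheap; Init() does the heavy lifting and reports
  // failures (e.g. MACE_OUT_OF_RESOURCES from a failed allocation) through
  // MaceStatus instead of aborting inside the constructor.
  mace::MaceEngine engine(mace::DeviceType::GPU);
  mace::MaceStatus status = engine.Init(net_def, input_names, output_names);
  if (status != mace::MaceStatus::MACE_SUCCESS) {
    return -1;
  }

  // Wrap caller-owned float buffers as MaceTensor inputs/outputs
  // (placeholder NHWC shapes).
  const std::vector<int64_t> input_shape = {1, 224, 224, 3};
  const std::vector<int64_t> output_shape = {1, 1, 1, 1000};
  auto input_data = std::shared_ptr<float>(new float[1 * 224 * 224 * 3],
                                           std::default_delete<float[]>());
  auto output_data = std::shared_ptr<float>(new float[1 * 1 * 1 * 1000],
                                            std::default_delete<float[]>());

  std::map<std::string, mace::MaceTensor> inputs;
  std::map<std::string, mace::MaceTensor> outputs;
  inputs[input_names[0]] = mace::MaceTensor(input_shape, input_data);
  outputs[output_names[0]] = mace::MaceTensor(output_shape, output_data);

  // Run() reports failures through MaceStatus as well.
  status = engine.Run(inputs, &outputs);
  return status == mace::MaceStatus::MACE_SUCCESS ? 0 : -1;
}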