提交 fe8b202b 编写于 作者: L Liangliang He

Merge branch 'master' into 'master'

Refactor tensor, accelerate gpu initialization time

See merge request !260
...@@ -26,17 +26,17 @@ class Allocator { ...@@ -26,17 +26,17 @@ class Allocator {
public: public:
Allocator() {} Allocator() {}
virtual ~Allocator() noexcept {} virtual ~Allocator() noexcept {}
virtual void *New(size_t nbytes) = 0; virtual void *New(size_t nbytes) const = 0;
virtual void *NewImage(const std::vector<size_t> &image_shape, virtual void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) = 0; const DataType dt) const = 0;
virtual void Delete(void *data) = 0; virtual void Delete(void *data) const = 0;
virtual void DeleteImage(void *data) = 0; virtual void DeleteImage(void *data) const = 0;
virtual void *Map(void *buffer, size_t nbytes) = 0; virtual void *Map(void *buffer, size_t offset, size_t nbytes) const = 0;
virtual void *MapImage(void *buffer, virtual void *MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) = 0; std::vector<size_t> *mapped_image_pitch) const = 0;
virtual void Unmap(void *buffer, void *mapper_ptr) = 0; virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
virtual bool OnHost() = 0; virtual bool OnHost() const = 0;
template <typename T> template <typename T>
T *New(size_t num_elements) { T *New(size_t num_elements) {
...@@ -52,7 +52,7 @@ class Allocator { ...@@ -52,7 +52,7 @@ class Allocator {
class CPUAllocator : public Allocator { class CPUAllocator : public Allocator {
public: public:
~CPUAllocator() override {} ~CPUAllocator() override {}
void *New(size_t nbytes) override { void *New(size_t nbytes) const override {
VLOG(3) << "Allocate CPU buffer: " << nbytes; VLOG(3) << "Allocate CPU buffer: " << nbytes;
void *data = nullptr; void *data = nullptr;
#ifdef __ANDROID__ #ifdef __ANDROID__
...@@ -67,27 +67,29 @@ class CPUAllocator : public Allocator { ...@@ -67,27 +67,29 @@ class CPUAllocator : public Allocator {
} }
void *NewImage(const std::vector<size_t> &shape, void *NewImage(const std::vector<size_t> &shape,
const DataType dt) override { const DataType dt) const override {
LOG(FATAL) << "Allocate CPU image"; LOG(FATAL) << "Allocate CPU image";
return nullptr; return nullptr;
} }
void Delete(void *data) override { void Delete(void *data) const override {
VLOG(3) << "Free CPU buffer"; VLOG(3) << "Free CPU buffer";
free(data); free(data);
} }
void DeleteImage(void *data) override { void DeleteImage(void *data) const override {
LOG(FATAL) << "Free CPU image"; LOG(FATAL) << "Free CPU image";
free(data); free(data);
}; };
void *Map(void *buffer, size_t nbytes) override { return buffer; } void *Map(void *buffer, size_t offset, size_t nbytes) const override {
return (char*)buffer + offset;
}
void *MapImage(void *buffer, void *MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override { std::vector<size_t> *mapped_image_pitch) const override {
return buffer; return buffer;
} }
void Unmap(void *buffer, void *mapper_ptr) override {} void Unmap(void *buffer, void *mapper_ptr) const override {}
bool OnHost() override { return true; } bool OnHost() const override { return true; }
}; };
std::map<int32_t, Allocator *> *gAllocatorRegistry(); std::map<int32_t, Allocator *> *gAllocatorRegistry();
......
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_BUFFER_H_
#define MACE_CORE_BUFFER_H_
#include "mace/core/types.h"
#include "mace/core/allocator.h"
#include <vector>
namespace mace {
// Abstract interface over a chunk of (possibly device-resident) memory.
// Concrete implementations visible in this file: Buffer (linear memory),
// Image (OpenCL image) and BufferSlice (a sub-range view of another buffer).
class BufferBase {
 public:
  BufferBase() : size_(0) {}
  // explicit: an index_t must not silently convert to a buffer object.
  explicit BufferBase(index_t size) : size_(size) {}
  virtual ~BufferBase() {}

  // Raw handle of the underlying storage (host pointer or device object).
  virtual void *buffer() = 0;
  // Host-accessible data; device-backed implementations require Map() first.
  virtual const void *raw_data() const = 0;
  virtual void *raw_mutable_data() = 0;
  // Map [offset, offset + length) into the host address space. `pitch`
  // receives row pitches for image-backed implementations (may be ignored).
  virtual void *Map(index_t offset,
                    index_t length,
                    std::vector<size_t> *pitch) const = 0;
  virtual void UnMap(void *mapped_ptr) const = 0;
  // Map/UnMap the whole buffer, caching the mapped pointer internally.
  virtual void Map(std::vector<size_t> *pitch) = 0;
  virtual void UnMap() = 0;
  virtual void Resize(index_t size) = 0;
  // Copy `length` bytes from `src + offset` into this buffer.
  virtual void Copy(void *src, index_t offset, index_t length) = 0;
  // True when data is host-addressable without mapping.
  virtual bool OnHost() const = 0;
  // Offset of this view inside its backing storage; non-zero only for slices.
  virtual index_t offset() const { return 0; }

  // Typed convenience accessors over raw_data()/raw_mutable_data().
  template<typename T>
  const T *data() const {
    return reinterpret_cast<const T *>(raw_data());
  }
  template<typename T>
  T *mutable_data() {
    return reinterpret_cast<T *>(raw_mutable_data());
  }
  // Size in bytes.
  index_t size() const {
    return size_;
  }

 protected:
  index_t size_;
};
// Linear memory buffer allocated through an Allocator. It either owns its
// storage (allocates in ctor/Resize, frees in dtor) or wraps caller-owned
// memory (is_data_owner_ == false).
class Buffer : public BufferBase {
 public:
  // Creates an empty buffer; storage appears on first Resize().
  explicit Buffer(Allocator *allocator)
      : BufferBase(0),
        allocator_(allocator),
        buf_(nullptr),
        mapped_buf_(nullptr),
        is_data_owner_(true) {}

  // Allocates `size` bytes immediately.
  Buffer(Allocator *allocator, index_t size)
      : BufferBase(size),
        allocator_(allocator),
        mapped_buf_(nullptr),
        is_data_owner_(true) {
    buf_ = allocator->New(size);
  }

  // Wraps caller-owned memory; this buffer never frees `data`.
  Buffer(Allocator *allocator, void *data, index_t size)
      : BufferBase(size),
        allocator_(allocator),
        buf_(data),
        mapped_buf_(nullptr),
        is_data_owner_(false) {}

  virtual ~Buffer() {
    if (mapped_buf_ != nullptr) {
      UnMap();
    }
    if (is_data_owner_ && buf_ != nullptr) {
      allocator_->Delete(buf_);
    }
  }

  void *buffer() {
    MACE_CHECK_NOTNULL(buf_);
    return buf_;
  }

  const void *raw_data() const {
    if (OnHost()) {
      MACE_CHECK_NOTNULL(buf_);
      return buf_;
    } else {
      // Device buffers must be Map()ed before host access.
      MACE_CHECK_NOTNULL(mapped_buf_);
      return mapped_buf_;
    }
  }

  void *raw_mutable_data() {
    if (OnHost()) {
      MACE_CHECK_NOTNULL(buf_);
      return buf_;
    } else {
      MACE_CHECK_NOTNULL(mapped_buf_);
      return mapped_buf_;
    }
  }

  // Maps a sub-range; `pitch` is unused for linear buffers.
  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
    MACE_CHECK_NOTNULL(buf_);
    return allocator_->Map(buf_, offset, length);
  }

  void UnMap(void *mapped_ptr) const {
    MACE_CHECK_NOTNULL(buf_);
    MACE_CHECK_NOTNULL(mapped_ptr);
    allocator_->Unmap(buf_, mapped_ptr);
  }

  void Map(std::vector<size_t> *pitch) {
    MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
    mapped_buf_ = Map(0, size_, pitch);
  }

  void UnMap() {
    UnMap(mapped_buf_);
    mapped_buf_ = nullptr;
  }

  // Reallocates when the requested size differs. Only legal on owned storage.
  void Resize(index_t size) {
    MACE_CHECK(is_data_owner_,
               "data is not owned by this buffer, cannot resize");
    if (size != size_) {
      if (buf_ != nullptr) {
        allocator_->Delete(buf_);
      }
      size_ = size;
      buf_ = allocator_->New(size);
    }
  }

  // Copies `length` bytes from `src + offset` into this buffer.
  // Fixed: writes through raw_mutable_data() instead of requiring
  // mapped_buf_, so host buffers work without a prior Map(); device buffers
  // still need to be mapped first (raw_mutable_data checks that).
  void Copy(void *src, index_t offset, index_t length) {
    MACE_CHECK(length <= size_, "out of buffer");
    memcpy(raw_mutable_data(), static_cast<char *>(src) + offset, length);
  }

  bool OnHost() const {
    return allocator_->OnHost();
  }

 private:
  Allocator *allocator_;  // not owned
  void *buf_;
  void *mapped_buf_;
  bool is_data_owner_;

  DISABLE_COPY_AND_ASSIGN(Buffer);
};
class Image : public BufferBase {
public:
Image()
: BufferBase(0),
allocator_(GetDeviceAllocator(OPENCL)),
buf_(nullptr),
mapped_buf_(nullptr) {}
Image(std::vector<size_t> shape, DataType data_type)
: BufferBase(std::accumulate(shape.begin(), shape.end(),
1, std::multiplies<index_t>())
* GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)),
mapped_buf_(nullptr) {
shape_ = shape;
data_type_ = data_type;
buf_ = allocator_->NewImage(shape, data_type);
}
virtual ~Image() {
if (mapped_buf_ != nullptr) {
UnMap();
}
if (buf_ != nullptr) {
allocator_->DeleteImage(buf_);
}
}
void *buffer() {
MACE_CHECK_NOTNULL(buf_);
return buf_;
};
const void *raw_data() const {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
void *raw_mutable_data() {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
std::vector<size_t> image_shape() const {
return shape_;
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void UnMap(void *mapped_ptr) const {
MACE_CHECK_NOTNULL(buf_);
MACE_CHECK_NOTNULL(mapped_ptr);
allocator_->Unmap(buf_, mapped_ptr);
}
void Map(std::vector<size_t> *pitch) {
MACE_CHECK_NOTNULL(buf_);
MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
MACE_CHECK_NOTNULL(pitch);
mapped_buf_ = allocator_->MapImage(buf_, shape_, pitch);
};
void UnMap() {
UnMap(mapped_buf_);
mapped_buf_ = nullptr;
};
void Resize(index_t size) {}
void Copy(void *src, index_t offset, index_t length) {
MACE_NOT_IMPLEMENTED;
}
bool OnHost() const {
return allocator_->OnHost();
}
private:
Allocator *allocator_;
std::vector<size_t> shape_;
DataType data_type_;
void *buf_;
void *mapped_buf_;
DISABLE_COPY_AND_ASSIGN(Image);
};
class BufferSlice : public BufferBase {
public:
BufferSlice() {}
BufferSlice(BufferBase *buffer, index_t offset, index_t length)
: BufferBase(buffer->size()),
buffer_(buffer),
mapped_buf_(nullptr),
offset_(offset),
length_(length) {
MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
MACE_CHECK(offset + length <= size_,
"buffer slice offset + length (",
offset,
" + ",
length,
") should <= ",
size_);
}
BufferSlice(const BufferSlice &other) : BufferSlice(other.buffer_,
other.offset_,
other.length_) {}
~BufferSlice() {
if (mapped_buf_ != nullptr) {
UnMap();
}
}
void *buffer() {
return buffer_->buffer();
};
const void *raw_data() const {
if (OnHost()) {
MACE_CHECK_NOTNULL(buffer_);
return (char *) buffer_->raw_data() + offset_;
} else {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
}
void *raw_mutable_data() {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void UnMap(void *mapped_ptr) const {
MACE_NOT_IMPLEMENTED;
}
void Map(std::vector<size_t> *pitch) {
MACE_CHECK_NOTNULL(buffer_);
MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null");
mapped_buf_ = buffer_->Map(offset_, length_, pitch);
};
void UnMap() {
MACE_CHECK_NOTNULL(mapped_buf_);
buffer_->UnMap(mapped_buf_);
mapped_buf_ = nullptr;
};
void Resize(index_t size) {
}
void Copy(void *src, index_t offset, index_t length) {
MACE_NOT_IMPLEMENTED;
}
index_t offset() const {
return offset_;
}
bool OnHost() const {
return buffer_->OnHost();
}
private:
BufferBase *buffer_;
void *mapped_buf_;
index_t offset_;
index_t length_;
};
}
#endif // MACE_CORE_BUFFER_H_
...@@ -542,7 +542,6 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) : ...@@ -542,7 +542,6 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
if (!net->Run()) { if (!net->Run()) {
LOG(FATAL) << "Net init run failed"; LOG(FATAL) << "Net init run failed";
} }
ws_->RemoveUnsedTensor();
net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type)); net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type));
} }
} }
......
...@@ -15,14 +15,14 @@ NetBase::NetBase(const std::shared_ptr<const OperatorRegistry> op_registry, ...@@ -15,14 +15,14 @@ NetBase::NetBase(const std::shared_ptr<const OperatorRegistry> op_registry,
DeviceType type) DeviceType type)
: op_registry_(op_registry), name_(net_def->name()) {} : op_registry_(op_registry), name_(net_def->name()) {}
SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry, SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, DeviceType type,
const NetMode mode) const NetMode mode)
: NetBase(op_registry, net_def, ws, type), : NetBase(op_registry, net_def, ws, type),
device_type_(type) { device_type_(type) {
MACE_LATENCY_LOGGER(1, "Constructing SimpleNet ", net_def->name()); MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
for (int idx = 0; idx < net_def->op_size(); ++idx) { for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx); const auto &operator_def = net_def->op(idx);
VLOG(3) << "Creating operator " << operator_def.name() << "(" VLOG(3) << "Creating operator " << operator_def.name() << "("
...@@ -36,7 +36,7 @@ SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry, ...@@ -36,7 +36,7 @@ SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry,
} }
} }
bool SimpleNet::Run(RunMetadata *run_metadata) { bool SerialNet::Run(RunMetadata *run_metadata) {
MACE_MEMORY_LOGGING_GUARD(); MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net"); MACE_LATENCY_LOGGER(1, "Running net");
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
...@@ -99,7 +99,7 @@ std::unique_ptr<NetBase> CreateNet( ...@@ -99,7 +99,7 @@ std::unique_ptr<NetBase> CreateNet(
Workspace *ws, Workspace *ws,
DeviceType type, DeviceType type,
const NetMode mode) { const NetMode mode) {
std::unique_ptr<NetBase> net(new SimpleNet(op_registry, net_def, ws, type, mode)); std::unique_ptr<NetBase> net(new SerialNet(op_registry, net_def, ws, type, mode));
return net; return net;
} }
......
...@@ -33,9 +33,9 @@ class NetBase { ...@@ -33,9 +33,9 @@ class NetBase {
DISABLE_COPY_AND_ASSIGN(NetBase); DISABLE_COPY_AND_ASSIGN(NetBase);
}; };
class SimpleNet : public NetBase { class SerialNet : public NetBase {
public: public:
SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry, SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, DeviceType type,
...@@ -47,7 +47,7 @@ class SimpleNet : public NetBase { ...@@ -47,7 +47,7 @@ class SimpleNet : public NetBase {
std::vector<std::unique_ptr<OperatorBase> > operators_; std::vector<std::unique_ptr<OperatorBase> > operators_;
DeviceType device_type_; DeviceType device_type_;
DISABLE_COPY_AND_ASSIGN(SimpleNet); DISABLE_COPY_AND_ASSIGN(SerialNet);
}; };
std::unique_ptr<NetBase> CreateNet( std::unique_ptr<NetBase> CreateNet(
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#ifndef MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_ #ifndef MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
#define MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_ #define MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
#include <unordered_map>
#include "mace/core/allocator.h" #include "mace/core/allocator.h"
namespace mace { namespace mace {
...@@ -13,17 +14,26 @@ class PreallocatedPooledAllocator { ...@@ -13,17 +14,26 @@ class PreallocatedPooledAllocator {
public: public:
PreallocatedPooledAllocator() {} PreallocatedPooledAllocator() {}
virtual ~PreallocatedPooledAllocator() noexcept {} ~PreallocatedPooledAllocator() noexcept {}
virtual void PreallocateImage(int mem_id, void SetBuffer(int mem_id, std::unique_ptr<BufferBase> &&buffer) {
const std::vector<size_t> &image_shape, buffers_[mem_id] = std::move(buffer);
DataType data_type) = 0; }
virtual void *GetImage(int mem_id) = 0; BufferBase *GetBuffer(int mem_id) {
if (buffers_.find(mem_id) != buffers_.end()) {
return buffers_[mem_id].get();
} else {
return nullptr;
}
}
virtual bool HasImage(int mem_id) = 0; virtual bool HasBuffer(int mem_id) {
return buffers_.find(mem_id) != buffers_.end();
}
virtual std::vector<size_t> GetImageSize(int mem_id) = 0; private:
std::unordered_map<int, std::unique_ptr<BufferBase>> buffers_;
}; };
} // namespace mace } // namespace mace
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
#include "mace/core/runtime/hexagon/quantize.h" #include "mace/core/runtime/hexagon/quantize.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/serializer.h"
namespace mace { namespace mace {
...@@ -37,7 +36,6 @@ class HexagonControlWrapper { ...@@ -37,7 +36,6 @@ class HexagonControlWrapper {
void SetGraphMode(int mode); void SetGraphMode(int mode);
private: private:
// CAVEAT: Need offset as HVX library reserves some ids
static constexpr int NODE_ID_OFFSET = 10000; static constexpr int NODE_ID_OFFSET = 10000;
inline uint32_t node_id(uint32_t nodeid) { inline uint32_t node_id(uint32_t nodeid) {
...@@ -45,7 +43,6 @@ class HexagonControlWrapper { ...@@ -45,7 +43,6 @@ class HexagonControlWrapper {
} }
int nn_id_; int nn_id_;
Serializer serializer_;
Quantizer quantizer_; Quantizer quantizer_;
std::vector<std::vector<index_t>> input_shapes_; std::vector<std::vector<index_t>> input_shapes_;
......
...@@ -35,7 +35,7 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) { ...@@ -35,7 +35,7 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
OpenCLAllocator::OpenCLAllocator() {} OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {} OpenCLAllocator::~OpenCLAllocator() {}
void *OpenCLAllocator::New(size_t nbytes) { void *OpenCLAllocator::New(size_t nbytes) const {
VLOG(3) << "Allocate OpenCL buffer: " << nbytes; VLOG(3) << "Allocate OpenCL buffer: " << nbytes;
cl_int error; cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(), cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
...@@ -47,7 +47,7 @@ void *OpenCLAllocator::New(size_t nbytes) { ...@@ -47,7 +47,7 @@ void *OpenCLAllocator::New(size_t nbytes) {
} }
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) { const DataType dt) const {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2"; MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1]; VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1];
...@@ -67,7 +67,7 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, ...@@ -67,7 +67,7 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
return cl_image; return cl_image;
} }
void OpenCLAllocator::Delete(void *buffer) { void OpenCLAllocator::Delete(void *buffer) const {
VLOG(3) << "Free OpenCL buffer"; VLOG(3) << "Free OpenCL buffer";
if (buffer != nullptr) { if (buffer != nullptr) {
cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer); cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
...@@ -75,7 +75,7 @@ void OpenCLAllocator::Delete(void *buffer) { ...@@ -75,7 +75,7 @@ void OpenCLAllocator::Delete(void *buffer) {
} }
} }
void OpenCLAllocator::DeleteImage(void *buffer) { void OpenCLAllocator::DeleteImage(void *buffer) const {
VLOG(3) << "Free OpenCL image"; VLOG(3) << "Free OpenCL image";
if (buffer != nullptr) { if (buffer != nullptr) {
cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer); cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
...@@ -83,13 +83,13 @@ void OpenCLAllocator::DeleteImage(void *buffer) { ...@@ -83,13 +83,13 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
} }
} }
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) { void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
auto cl_buffer = static_cast<cl::Buffer *>(buffer); auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue(); auto queue = OpenCLRuntime::Global()->command_queue();
// TODO(heliangliang) Non-blocking call // TODO(heliangliang) Non-blocking call
cl_int error; cl_int error;
void *mapped_ptr = void *mapped_ptr =
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
nbytes, nullptr, nullptr, &error); nbytes, nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS);
return mapped_ptr; return mapped_ptr;
...@@ -98,33 +98,33 @@ void *OpenCLAllocator::Map(void *buffer, size_t nbytes) { ...@@ -98,33 +98,33 @@ void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
// TODO : there is something wrong with half type. // TODO : there is something wrong with half type.
void *OpenCLAllocator::MapImage(void *buffer, void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) { std::vector<size_t> *mapped_image_pitch) const {
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image"; MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
auto cl_image = static_cast<cl::Image2D *>(buffer); auto cl_image = static_cast<cl::Image2D *>(buffer);
std::array<size_t, 3> origin = {0, 0, 0}; std::array<size_t, 3> origin = {0, 0, 0};
std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1}; std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
mapped_image_pitch.resize(2); mapped_image_pitch->resize(2);
cl_int error; cl_int error;
void *mapped_ptr = void *mapped_ptr =
OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image, OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region, origin, region,
&mapped_image_pitch[0], mapped_image_pitch->data(),
&mapped_image_pitch[1], mapped_image_pitch->data() + 1,
nullptr, nullptr, &error); nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error; MACE_CHECK(error == CL_SUCCESS) << error;
return mapped_ptr; return mapped_ptr;
} }
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) { void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
auto cl_buffer = static_cast<cl::Buffer *>(buffer); auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue(); auto queue = OpenCLRuntime::Global()->command_queue();
MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr, MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr,
nullptr) == CL_SUCCESS); nullptr) == CL_SUCCESS);
} }
bool OpenCLAllocator::OnHost() { return false; } bool OpenCLAllocator::OnHost() const { return false; }
} // namespace mace } // namespace mace
...@@ -15,7 +15,7 @@ class OpenCLAllocator : public Allocator { ...@@ -15,7 +15,7 @@ class OpenCLAllocator : public Allocator {
~OpenCLAllocator() override; ~OpenCLAllocator() override;
void *New(size_t nbytes) override; void *New(size_t nbytes) const override;
/* /*
* Use Image2D with RGBA (128-bit) format to represent the image. * Use Image2D with RGBA (128-bit) format to represent the image.
...@@ -23,21 +23,21 @@ class OpenCLAllocator : public Allocator { ...@@ -23,21 +23,21 @@ class OpenCLAllocator : public Allocator {
* @ shape : [depth, ..., height, width ]. * @ shape : [depth, ..., height, width ].
*/ */
void *NewImage(const std::vector<size_t> &image_shape, void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) override; const DataType dt) const override;
void Delete(void *buffer) override; void Delete(void *buffer) const override;
void DeleteImage(void *buffer) override; void DeleteImage(void *buffer) const override;
void *Map(void *buffer, size_t nbytes) override; void *Map(void *buffer, size_t offset, size_t nbytes) const override;
void *MapImage(void *buffer, void *MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override; std::vector<size_t> *mapped_image_pitch) const override;
void Unmap(void *buffer, void *mapped_ptr) override; void Unmap(void *buffer, void *mapped_ptr) const override;
bool OnHost() override; bool OnHost() const override;
}; };
} // namespace mace } // namespace mace
......
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"
namespace mace {
// Binds the pool to the process-wide OpenCL device allocator, which performs
// the actual image allocation/deallocation.
OpenCLPreallocatedPooledAllocator::OpenCLPreallocatedPooledAllocator()
    : allocator(GetDeviceAllocator(DeviceType::OPENCL)) {
}
// Preallocated images are released by the unique_ptr custom deleters
// (see images_ in the header), so the destructor body is empty.
OpenCLPreallocatedPooledAllocator::~OpenCLPreallocatedPooledAllocator() {
}
// Allocates an OpenCL image for `mem_id` and records its shape so it can be
// looked up later via GetImage()/GetImageSize(). Fails fatally when the id
// is already in use.
void OpenCLPreallocatedPooledAllocator::PreallocateImage(int mem_id,
                                                         const std::vector<
                                                             size_t> &image_shape,
                                                         DataType data_type) {
  MACE_CHECK(!this->HasImage(mem_id), "Memory already exists: ", mem_id);
  VLOG(2) << "Preallocate OpenCL image: " << mem_id << " "
          << image_shape[0] << ", " << image_shape[1];
  // Assigning the temporary unique_ptr already moves; the redundant
  // std::move wrapper around the temporary has been dropped. The custom
  // deleter hands the image back to the device allocator.
  images_[mem_id] = std::unique_ptr<void, std::function<void(void *)>>(
      allocator->NewImage(image_shape, data_type), [this](void *p) {
        this->allocator->DeleteImage(p);
      });
  image_shapes_[mem_id] = image_shape;
}
} // namespace mace
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
#define MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
#include "mace/core/preallocated_pooled_allocator.h"
#include <unordered_map>
namespace mace {
// PreallocatedPooledAllocator backed by OpenCL images: images are created
// up-front (PreallocateImage) and handed out by integer memory id.
class OpenCLPreallocatedPooledAllocator : public PreallocatedPooledAllocator {
 public:
  OpenCLPreallocatedPooledAllocator();
  ~OpenCLPreallocatedPooledAllocator() override;

  // Allocates an image of `image_shape`/`data_type` under `mem_id`;
  // fatal if the id is already in use. Implemented in the .cc file.
  void PreallocateImage(int mem_id,
                        const std::vector<size_t> &image_shape,
                        DataType data_type) override;

  // Returns the preallocated image handle; fatal when `mem_id` is unknown.
  // (in-class definitions are implicitly inline; redundant `inline` removed)
  void *GetImage(int mem_id) override {
    MACE_CHECK(HasImage(mem_id), "image does not exist");
    return images_.at(mem_id).get();
  }

  bool HasImage(int mem_id) override {
    return images_.find(mem_id) != images_.end();
  }

  // Shape recorded at preallocation time. Fixed: the previous operator[]
  // lookup silently default-inserted an empty shape for unknown ids; now an
  // unknown id fails fatally instead.
  std::vector<size_t> GetImageSize(int mem_id) override {
    MACE_CHECK(HasImage(mem_id), "image does not exist: ", mem_id);
    return image_shapes_.at(mem_id);
  }

 private:
  // mem_id -> image handle; freed through the allocator by a custom deleter.
  std::unordered_map<int, std::unique_ptr<void, std::function<void(void *)>>>
      images_;
  // mem_id -> shape passed to PreallocateImage.
  std::unordered_map<int, std::vector<size_t>> image_shapes_;
  Allocator *allocator;  // not owned; process-wide OpenCL allocator
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/serializer.h"
namespace mace {
// Tensor -> ConstTensor serialization is not supported yet: this always
// aborts via MACE_NOT_IMPLEMENTED (the return is never reached).
std::unique_ptr<ConstTensor> Serializer::Serialize(const Tensor &tensor,
                                                   const std::string &name) {
  MACE_NOT_IMPLEMENTED;
  return nullptr;
}
// Materializes a serialized ConstTensor into a freshly-allocated Tensor on
// the given device. Types narrower than 32 bits (and bool) are stored
// widened to int32 in the serialized form and are narrowed back on copy
// with CopyWithCast<int32_t, NarrowType>.
std::unique_ptr<Tensor> Serializer::Deserialize(const ConstTensor &const_tensor,
                                                DeviceType type) {
  std::unique_ptr<Tensor> tensor(
      new Tensor(GetDeviceAllocator(type), const_tensor.data_type()));
  std::vector<index_t> dims;
  for (const index_t d : const_tensor.dims()) {
    dims.push_back(d);
  }
  tensor->Resize(dims);

  switch (const_tensor.data_type()) {
    case DT_HALF:
      tensor->Copy<half>(reinterpret_cast<const half *>(const_tensor.data()),
                         const_tensor.data_size());
      break;
    case DT_FLOAT:
      tensor->Copy<float>(reinterpret_cast<const float *>(const_tensor.data()),
                          const_tensor.data_size());
      break;
    case DT_DOUBLE:
      tensor->Copy<double>(
          reinterpret_cast<const double *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_INT32:
      tensor->Copy<int32_t>(
          reinterpret_cast<const int32_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_INT64:
      tensor->Copy<int64_t>(
          reinterpret_cast<const int64_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_UINT8:
      tensor->Copy<uint8_t>(
          reinterpret_cast<const uint8_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_INT16:
      // BUG FIX: previously narrowed through uint16_t; signed 16-bit data
      // must be cast to int16_t (the targets of this and the DT_UINT16
      // case were swapped).
      tensor->CopyWithCast<int32_t, int16_t>(
          reinterpret_cast<const int32_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_INT8:
      tensor->CopyWithCast<int32_t, int8_t>(
          reinterpret_cast<const int32_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_UINT16:
      // BUG FIX: previously narrowed through int16_t (swapped with the
      // DT_INT16 case above); unsigned data uses uint16_t.
      tensor->CopyWithCast<int32_t, uint16_t>(
          reinterpret_cast<const int32_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_BOOL:
      tensor->CopyWithCast<int32_t, bool>(
          reinterpret_cast<const int32_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    default:
      MACE_NOT_IMPLEMENTED;
      break;
  }
  return tensor;
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_SERIALIZER_H_
#define MACE_CORE_SERIALIZER_H_
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
namespace mace {
// Converts between in-memory Tensor objects and the serialized ConstTensor
// representation embedded in a model (see serializer.cc for details).
class Serializer {
 public:
  Serializer() {}
  ~Serializer() {}
  // Tensor -> ConstTensor. NOTE(review): currently aborts via
  // MACE_NOT_IMPLEMENTED in serializer.cc.
  std::unique_ptr<ConstTensor> Serialize(const Tensor &tensor,
                                         const std::string &name);
  // ConstTensor -> newly allocated Tensor on the given device, copying
  // (and narrowing, where needed) the serialized data.
  std::unique_ptr<Tensor> Deserialize(const ConstTensor &const_tensor,
                                      DeviceType type);
  DISABLE_COPY_AND_ASSIGN(Serializer);
};
} // namespace mace
#endif // MACE_CORE_SERIALIZER_H_
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#ifndef MACE_CORE_TENSOR_H_ #ifndef MACE_CORE_TENSOR_H_
#define MACE_CORE_TENSOR_H_ #define MACE_CORE_TENSOR_H_
#include "mace/core/allocator.h" #include "mace/core/buffer.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -46,7 +46,6 @@ namespace mace { ...@@ -46,7 +46,6 @@ namespace mace {
CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, LOG(FATAL) << "Type not set"; \ CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, LOG(FATAL) << "Type not set"; \
, LOG(FATAL) << "Unexpected type: " << TYPE_ENUM;) , LOG(FATAL) << "Unexpected type: " << TYPE_ENUM;)
namespace numerical_chars { namespace numerical_chars {
inline std::ostream &operator<<(std::ostream &os, char c) { inline std::ostream &operator<<(std::ostream &os, char c) {
return std::is_signed<char>::value ? os << static_cast<int>(c) return std::is_signed<char>::value ? os << static_cast<int>(c)
...@@ -64,27 +63,30 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { ...@@ -64,27 +63,30 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
class Tensor { class Tensor {
public: public:
Tensor()
: alloc_(GetDeviceAllocator(DeviceType::CPU)),
size_(0),
dtype_(DT_FLOAT),
buffer_(nullptr),
data_(nullptr),
unused_(false),
is_image_(false){
};
Tensor(Allocator *alloc, DataType type) Tensor(Allocator *alloc, DataType type)
: alloc_(alloc), : allocator_(alloc),
size_(0), dtype_(type),
dtype_(type), buffer_(nullptr),
buffer_(nullptr), is_buffer_owner_(true) {};
data_(nullptr),
unused_(false), Tensor(BufferBase *buffer, DataType dtype)
is_image_(false){ : dtype_(dtype),
}; buffer_(buffer),
is_buffer_owner_(false) {}
Tensor(const BufferSlice &buffer_slice, DataType dtype)
: dtype_(dtype),
buffer_slice_(buffer_slice),
is_buffer_owner_(false) {
buffer_ = &buffer_slice_;
}
Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {}
~Tensor() { ~Tensor() {
if (is_buffer_owner_ && buffer_ != nullptr) {
delete buffer_;
}
} }
inline DataType dtype() const { return dtype_; } inline DataType dtype() const { return dtype_; }
...@@ -93,182 +95,113 @@ class Tensor { ...@@ -93,182 +95,113 @@ class Tensor {
inline const std::vector<index_t> &shape() const { return shape_; } inline const std::vector<index_t> &shape() const { return shape_; }
inline const std::vector<size_t> &image_shape() const { return image_shape_; }
inline const bool is_image() const { return is_image_; }
inline index_t dim_size() const { return shape_.size(); } inline index_t dim_size() const { return shape_.size(); }
inline index_t dim(unsigned int index) const { inline index_t dim(unsigned int index) const {
MACE_CHECK(index < shape_.size(), "Dim out of range: ", MACE_CHECK(index < shape_.size(), "Dim out of range: ",
index, " >= ", shape_.size()); index, " >= ", shape_.size());
return shape_[index]; return shape_[index];
} }
inline index_t size() const { return size_; } inline index_t size() const {
inline index_t raw_size() const { return size_ * SizeOfType(); }
inline const bool unused() const { return unused_; }
inline int64_t NumElements() const {
return std::accumulate(shape_.begin(), shape_.end(), 1, return std::accumulate(shape_.begin(), shape_.end(), 1,
std::multiplies<int64_t>()); std::multiplies<int64_t>());
} }
inline const bool OnHost() const { return alloc_->OnHost(); } inline index_t raw_size() const {
return size() * SizeOfType();
/*
* Map the device buffer as CPU buffer to access the data, unmap must be
* called later
*/
inline void Map() const {
if (!OnHost()) {
MACE_CHECK(buffer_ != nullptr && data_ == nullptr);
data_ = alloc_->Map(buffer_.get(), size_ * SizeOfType());
}
} }
inline void MapImage(std::vector<size_t> &mapped_image_pitch) const { inline void *buffer() const {
MACE_CHECK(!OnHost() && buffer_ != nullptr && data_ == nullptr); MACE_CHECK(buffer_ != nullptr && buffer_->buffer() != nullptr,
data_ = alloc_->MapImage(buffer_.get(), image_shape_, mapped_image_pitch); "buffer is null");
return buffer_->buffer();
} }
/* inline index_t buffer_offset() const {
* Unmap the device buffer return buffer_->offset();
*/
inline void Unmap() const {
if (!OnHost()) {
MACE_CHECK(buffer_ != nullptr && data_ != nullptr);
alloc_->Unmap(buffer_.get(), data_);
data_ = nullptr;
}
} }
void *buffer() const { return buffer_.get(); }
inline const void *raw_data() const { inline const void *raw_data() const {
void *data = MappedBuffer(); MACE_CHECK(buffer_ != nullptr, "buffer is null");
MACE_CHECK(data != nullptr || size_ == 0, return buffer_->raw_data();
"The tensor is of non-zero shape, but its data is not allocated "
"or mapped yet.");
return data;
} }
template <typename T> template<typename T>
inline const T *data() const { inline const T *data() const {
return static_cast<const T *>(raw_data()); MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->data<T>();
} }
inline void *raw_mutable_data() { inline void *raw_mutable_data() {
void *data = MappedBuffer(); MACE_CHECK(buffer_ != nullptr, "buffer is null");
MACE_CHECK(data != nullptr || size_ == 0, return buffer_->raw_mutable_data();
"The tensor is of non-zero shape, but its data is not allocated "
"or mapped yet.");
return data;
} }
template <typename T> template<typename T>
inline T *mutable_data() { inline T *mutable_data() {
return static_cast<T *>(raw_mutable_data()); MACE_CHECK(buffer_ != nullptr, "buffer is null");
return static_cast<T *>(buffer_->raw_mutable_data());
} }
inline void Resize(const std::vector<index_t> &shape) { inline void Reshape(const std::vector<index_t> &shape) {
MACE_CHECK(!is_image_ || buffer_ == nullptr,
"Resize is not for image, use ResizeImage instead.");
is_image_ = false;
shape_ = shape; shape_ = shape;
index_t size = NumElements(); MACE_CHECK(raw_size() <= buffer_->size());
if (size_ != size) {
size_ = size;
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
CASES(dtype_,
(buffer_ =
std::move(std::unique_ptr<void, std::function<void(void *)>>(
alloc_->New(size_ * sizeof(T)),
[this](void *p) {
this->alloc_->Delete(p);
})
)));
}
} }
inline void ResizeImage(const std::vector<index_t> &shape, inline void Resize(const std::vector<index_t> &shape) {
const std::vector<size_t> &image_shape) {
MACE_CHECK(is_image_ || buffer_ == nullptr,
"ResizeImage is not for buffer, use Resize instead.");
is_image_ = true;
shape_ = shape; shape_ = shape;
index_t size = NumElements(); if (buffer_ != nullptr) {
if (size_ != size) { buffer_->Resize(raw_size());
size_ = size; } else {
image_shape_ = image_shape; buffer_ = new Buffer(allocator_, raw_size());
if (!preallocated_image_shape_.empty()) { is_buffer_owner_ = true;
MACE_CHECK(preallocated_image_shape_[0] >= image_shape[0]
&& preallocated_image_shape_[1] >= image_shape[1],
"image shape not large enough: preallocated ",
preallocated_image_shape_[0],
" ",
preallocated_image_shape_[1],
"apply for ",
image_shape[0],
" ",
image_shape[1]);
} else {
buffer_ = std::move(std::unique_ptr<void, std::function<void(void *)>>(
alloc_->NewImage(image_shape, dtype_),
[this](void *p) { this->alloc_->DeleteImage(p); }));
preallocated_image_shape_ = image_shape;
}
} }
} }
inline void ResizeLike(const Tensor &other) { inline void ResizeLike(const Tensor &other) {
if (other.is_image()) { Resize(other.shape());
ResizeImage(other.shape(), other.image_shape());
} else {
Resize(other.shape());
}
} }
inline void ResizeLike(const Tensor *other) { inline void ResizeLike(const Tensor *other) {
if (other->is_image()) { Resize(other->shape());
ResizeImage(other->shape(), other->image_shape()); }
} else {
Resize(other->shape()); inline void ResizeImage(const std::vector<index_t> &shape,
const std::vector<size_t> &image_shape) {
shape_ = shape;
if (buffer_ == nullptr) {
buffer_ = new Image(image_shape, dtype_);
is_buffer_owner_ = true;
} }
} }
inline void PreallocateImage(void *image, inline void CopyBytes(const void *src, size_t size) {
const std::vector<size_t>& image_shape) { MappingGuard guard(this);
is_image_ = true; memcpy(buffer_->raw_mutable_data(), src, size);
buffer_ = std::move(std::unique_ptr<void, std::function<void(void *)>>(
image, [](void *p) {
// tensor does not have ownership of preallocated memory
}));
preallocated_image_shape_ = image_shape;
} }
template <typename T> template<typename T>
inline void Copy(const T *src, index_t size) { inline void Copy(const T *src, index_t length) {
MACE_CHECK(size == size_, "copy src and dst with different size."); MACE_CHECK(length == size(), "copy src and dst with different size.");
CopyBytes(static_cast<const void *>(src), sizeof(T) * size); CopyBytes(static_cast<const void *>(src), sizeof(T) * length);
} }
template <typename SrcType, typename DstType> inline void Copy(const Tensor &other) {
inline void CopyWithCast(const SrcType *src, size_t size) { dtype_ = other.dtype_;
MACE_CHECK(static_cast<index_t>(size) == size_, ResizeLike(other);
"copy src and dst with different size."); MappingGuard map_other(&other);
std::unique_ptr<DstType[]> buffer(new DstType[size]); CopyBytes(other.raw_data(), other.size() * SizeOfType());
for (size_t i = 0; i < size; ++i) {
buffer[i] = static_cast<DstType>(src[i]);
}
CopyBytes(static_cast<const void *>(buffer.get()), sizeof(DstType) * size);
} }
inline void CopyBytes(const void *src, size_t size) { inline size_t SizeOfType() const {
MappingGuard map_this(this); size_t type_size = 0;
memcpy(raw_mutable_data(), src, size); CASES(dtype_, type_size = sizeof(T));
return type_size;
}
inline BufferBase *UnderlyingBuffer() const {
return buffer_;
} }
inline void DebugPrint() const { inline void DebugPrint() const {
...@@ -281,8 +214,8 @@ class Tensor { ...@@ -281,8 +214,8 @@ class Tensor {
os.str(""); os.str("");
os.clear(); os.clear();
MappingGuard guard(this); MappingGuard guard(this);
for (int i = 0; i < size_; ++i) { for (int i = 0; i < size(); ++i) {
if ( i != 0 && i % shape_[3] == 0) { if (i != 0 && i % shape_[3] == 0) {
os << "\n"; os << "\n";
} }
CASES(dtype_, (os << (this->data<T>()[i]) << ", ")); CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
...@@ -291,37 +224,11 @@ class Tensor { ...@@ -291,37 +224,11 @@ class Tensor {
<< dim(2) << ", " << dim(3) << "], content:\n" << os.str(); << dim(2) << ", " << dim(3) << "], content:\n" << os.str();
} }
inline size_t SizeOfType() const {
size_t type_size = 0;
CASES(dtype_, type_size = sizeof(T));
return type_size;
}
inline void Copy(const Tensor &other) {
alloc_ = other.alloc_;
dtype_ = other.dtype_;
ResizeLike(other);
MappingGuard map_other(&other);
if (is_image_) {
LOG(FATAL) << "Not support copy image tensor, please use Copy API.";
} else {
CopyBytes(other.raw_data(), size_ * SizeOfType());
}
}
inline void MarkUnused() {
this->unused_ = true;
}
class MappingGuard { class MappingGuard {
public: public:
MappingGuard(const Tensor *tensor) : tensor_(tensor) { MappingGuard(const Tensor *tensor) : tensor_(tensor) {
if (tensor_ != nullptr) { if (tensor_ != nullptr) {
if (tensor_->is_image()) { tensor_->buffer_->Map(&mapped_image_pitch_);
tensor_->MapImage(mapped_image_pitch_);
} else {
tensor_->Map();
}
} }
} }
...@@ -331,42 +238,29 @@ class Tensor { ...@@ -331,42 +238,29 @@ class Tensor {
} }
~MappingGuard() { ~MappingGuard() {
if (tensor_ != nullptr) tensor_->Unmap(); if (tensor_ != nullptr) tensor_->buffer_->UnMap();
} }
inline const std::vector<size_t> &mapped_image_pitch() const { return mapped_image_pitch_; } inline const std::vector<size_t> &mapped_image_pitch() const {
return mapped_image_pitch_;
}
private: private:
const Tensor *tensor_; const Tensor *tensor_;
std::vector<size_t> mapped_image_pitch_; std::vector<size_t> mapped_image_pitch_;
DISABLE_COPY_AND_ASSIGN(MappingGuard); DISABLE_COPY_AND_ASSIGN(MappingGuard);
}; };
private: private:
inline void *MappedBuffer() const { Allocator *allocator_;
if (OnHost()) {
return buffer_.get();
}
return data_;
}
Allocator *alloc_;
index_t size_;
DataType dtype_; DataType dtype_;
// Raw buffer, must be mapped as host accessable data before
// read or write
std::unique_ptr<void, std::function<void(void*)>> buffer_;
// Mapped buffer
mutable void *data_;
std::vector<index_t> shape_; std::vector<index_t> shape_;
// Image for opencl BufferBase *buffer_;
bool unused_; BufferSlice buffer_slice_;
bool is_image_; bool is_buffer_owner_;
std::vector<size_t> image_shape_;
std::vector<size_t> preallocated_image_shape_;
DISABLE_COPY_AND_ASSIGN(Tensor); DISABLE_COPY_AND_ASSIGN(Tensor);
}; };
} // namespace tensor } // namespace tensor
......
...@@ -6,21 +6,11 @@ ...@@ -6,21 +6,11 @@
#include <vector> #include <vector>
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
#include "mace/core/serializer.h"
#include "mace/core/arg_helper.h" #include "mace/core/arg_helper.h"
#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
namespace mace { namespace mace {
std::vector<std::string> Workspace::Tensors() const {
std::vector<std::string> names;
for (auto &entry : tensor_map_) {
names.push_back(entry.first);
}
return names;
}
Tensor *Workspace::CreateTensor(const std::string &name, Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc, Allocator *alloc,
DataType type) { DataType type) {
...@@ -28,21 +18,12 @@ Tensor *Workspace::CreateTensor(const std::string &name, ...@@ -28,21 +18,12 @@ Tensor *Workspace::CreateTensor(const std::string &name,
VLOG(3) << "Tensor " << name << " already exists. Skipping."; VLOG(3) << "Tensor " << name << " already exists. Skipping.";
} else { } else {
VLOG(3) << "Creating Tensor " << name; VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] = std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type))); tensor_map_[name] =
std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
} }
return GetTensor(name); return GetTensor(name);
} }
bool Workspace::RemoveTensor(const std::string &name) {
auto it = tensor_map_.find(name);
if (it != tensor_map_.end()) {
VLOG(3) << "Removing blob " << name << " from this workspace.";
tensor_map_.erase(it);
return true;
}
return false;
}
const Tensor *Workspace::GetTensor(const std::string &name) const { const Tensor *Workspace::GetTensor(const std::string &name) const {
if (tensor_map_.count(name)) { if (tensor_map_.count(name)) {
return tensor_map_.at(name).get(); return tensor_map_.at(name).get();
...@@ -52,26 +33,51 @@ const Tensor *Workspace::GetTensor(const std::string &name) const { ...@@ -52,26 +33,51 @@ const Tensor *Workspace::GetTensor(const std::string &name) const {
return nullptr; return nullptr;
} }
void Workspace::RemoveUnsedTensor() {
auto iter = tensor_map_.begin();
auto end_iter = tensor_map_.end();
while(iter != end_iter) {
auto old_iter = iter++;
if(old_iter->second->unused()) {
tensor_map_.erase(old_iter);
}
}
}
Tensor *Workspace::GetTensor(const std::string &name) { Tensor *Workspace::GetTensor(const std::string &name) {
return const_cast<Tensor *>( return const_cast<Tensor *>(
static_cast<const Workspace *>(this)->GetTensor(name)); static_cast<const Workspace *>(this)->GetTensor(name));
}
std::vector<std::string> Workspace::Tensors() const {
std::vector<std::string> names;
for (auto &entry : tensor_map_) {
names.push_back(entry.first);
}
return names;
} }
void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
MACE_LATENCY_LOGGER(1, "Load model tensors"); MACE_LATENCY_LOGGER(1, "Load model tensors");
Serializer serializer; index_t model_data_size = 0;
unsigned char *model_data_ptr = nullptr;
for (auto &const_tensor : net_def.tensors()) {
if (model_data_ptr == nullptr
|| reinterpret_cast<long long>(const_tensor.data())
< reinterpret_cast<long long>(model_data_ptr)) {
model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
}
}
for (auto &const_tensor : net_def.tensors()) {
model_data_size = std::max(model_data_size,
static_cast<index_t>(
(reinterpret_cast<long long>(const_tensor.data())
- reinterpret_cast<long long>(model_data_ptr))
+ const_tensor.data_size()
* GetEnumTypeSize(const_tensor.data_type())));
}
VLOG(3) << "Model data size: " << model_data_size;
if (type == DeviceType::CPU) {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
} else {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_size)));
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(model_data_ptr, 0, model_data_size);
tensor_buffer_->UnMap();
}
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name() VLOG(3) << "Tensor name: " << const_tensor.name()
...@@ -79,9 +85,24 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -79,9 +85,24 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
<< ", shape: " << ", shape: "
<< MakeString(std::vector<index_t>(const_tensor.dims().begin(), << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
const_tensor.dims().end())); const_tensor.dims().end()));
tensor_map_[const_tensor.name()] = std::vector<index_t> dims;
serializer.Deserialize(const_tensor, type); for (const index_t d : const_tensor.dims()) {
dims.push_back(d);
}
index_t
offset = (long long) const_tensor.data() - (long long) model_data_ptr;
std::unique_ptr<Tensor> tensor(
new Tensor(BufferSlice(tensor_buffer_.get(),
offset,
const_tensor.data_size()
* GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type()));
tensor->Reshape(dims);
tensor_map_[const_tensor.name()] = std::move(tensor);
} }
if (type == DeviceType::OPENCL) { if (type == DeviceType::OPENCL) {
CreateImageOutputTensor(net_def); CreateImageOutputTensor(net_def);
} }
...@@ -91,9 +112,6 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { ...@@ -91,9 +112,6 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) { if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) {
return; return;
} }
preallocated_allocator_ =
std::move(std::unique_ptr<PreallocatedPooledAllocator>(
new OpenCLPreallocatedPooledAllocator));
DataType dtype = DataType::DT_INVALID; DataType dtype = DataType::DT_INVALID;
// We use the data type of the first op (with mem id, must be image), // We use the data type of the first op (with mem id, must be image),
...@@ -116,18 +134,16 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { ...@@ -116,18 +134,16 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
} }
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block: net_def.mem_arena().mem_block()) { for (auto &mem_block: net_def.mem_arena().mem_block()) {
preallocated_allocator_->PreallocateImage(mem_block.mem_id(), std::unique_ptr<BufferBase>
{mem_block.x(), mem_block.y()}, image_buf(new Image({mem_block.x(), mem_block.y()}, dtype));
dtype); preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
} }
VLOG(3) << "Preallocate image to tensors"; VLOG(3) << "Preallocate image to tensors";
auto allocator = GetDeviceAllocator(DeviceType::OPENCL);
for (auto &op: net_def.op()) { for (auto &op: net_def.op()) {
if (op.has_mem_id()) { if (op.has_mem_id()) {
CreateTensor(op.output(0), allocator, dtype); std::unique_ptr<Tensor> tensor
tensor_map_[op.output(0)]->PreallocateImage( (new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
preallocated_allocator_->GetImage(op.mem_id()), tensor_map_[op.output(0)] = std::move(tensor);
preallocated_allocator_->GetImageSize(op.mem_id()));
} }
} }
} }
......
...@@ -15,26 +15,23 @@ class Workspace { ...@@ -15,26 +15,23 @@ class Workspace {
public: public:
typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap; typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
Workspace() Workspace() {}
: preallocated_allocator_(nullptr) {}
~Workspace() {} ~Workspace() {}
std::vector<std::string> Tensors() const; Tensor *CreateTensor(const std::string &name,
Allocator *alloc,
Tensor *CreateTensor(const std::string &name, Allocator *alloc, DataType type); DataType type);
bool RemoveTensor(const std::string &name);
void RemoveUnsedTensor();
inline bool HasTensor(const std::string &name) const { inline bool HasTensor(const std::string &name) const {
return tensor_map_.count(name); return tensor_map_.find(name) != tensor_map_.end();
} }
const Tensor *GetTensor(const std::string &name) const; const Tensor *GetTensor(const std::string &name) const;
Tensor *GetTensor(const std::string &name); Tensor *GetTensor(const std::string &name);
std::vector<std::string> Tensors() const;
void LoadModelTensor(const NetDef &net_def, DeviceType type); void LoadModelTensor(const NetDef &net_def, DeviceType type);
private: private:
...@@ -42,9 +39,11 @@ class Workspace { ...@@ -42,9 +39,11 @@ class Workspace {
TensorMap tensor_map_; TensorMap tensor_map_;
std::unique_ptr<PreallocatedPooledAllocator> preallocated_allocator_; std::unique_ptr<BufferBase> tensor_buffer_;
PreallocatedPooledAllocator preallocated_allocator_;
DISABLE_COPY_AND_ASSIGN(Workspace); DISABLE_COPY_AND_ASSIGN(Workspace);
}; };
} // namespace mace } // namespace mace
......
...@@ -127,7 +127,7 @@ struct BatchNormFunctor : BatchNormFunctorBase { ...@@ -127,7 +127,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
} }
} }
} }
DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_); relux_max_limit_);
} }
}; };
......
...@@ -616,7 +616,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { ...@@ -616,7 +616,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
} }
} }
} }
DoActivation(output_data, output_data, output->NumElements(), activation_, DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_); relux_max_limit_);
} }
}; };
......
...@@ -402,7 +402,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { ...@@ -402,7 +402,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
valid_h_stop, valid_w_start, valid_w_stop); valid_h_stop, valid_w_start, valid_w_stop);
output_ptr = output->mutable_data<T>(); output_ptr = output->mutable_data<T>();
DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_); relux_max_limit_);
} }
}; };
......
...@@ -65,7 +65,7 @@ struct FullyConnectedFunctor : FullyConnectedBase { ...@@ -65,7 +65,7 @@ struct FullyConnectedFunctor : FullyConnectedBase {
} }
} }
DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_); relux_max_limit_);
} }
}; };
......
...@@ -14,7 +14,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -14,7 +14,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type, const BufferType type,
Tensor *image, Tensor *image,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type";
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
if (!i2b_) { if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape); CalImage2DShape(buffer->shape(), type, image_shape);
...@@ -25,9 +24,9 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -25,9 +24,9 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
} else { } else {
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
} }
buffer->MarkUnused();
} else { } else {
image_shape = image->image_shape(); Image *image_buf = dynamic_cast<Image*>(image->UnderlyingBuffer());
image_shape = image_buf->image_shape();
buffer->Resize(image->shape()); buffer->Resize(image->shape());
} }
...@@ -78,7 +77,11 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -78,7 +77,11 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options); built_options);
uint32_t idx = 0; uint32_t idx = 0;
b2f_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(buffer->buffer()))); b2f_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(buffer->buffer())));
if (!i2b_) {
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned");
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype())));
}
if (type == ARGUMENT) { if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if(type == WEIGHT_HEIGHT) { } else if(type == WEIGHT_HEIGHT) {
......
#include <common.h> #include <common.h>
__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
__private const int input_offset,
__private const int filter_w, __private const int filter_w,
__private const int out_channel, __private const int out_channel,
__private const int in_channel, __private const int in_channel,
...@@ -13,7 +14,7 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o ...@@ -13,7 +14,7 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
const int in_channel_idx = w % rounded_in_channel; const int in_channel_idx = w % rounded_in_channel;
const int h_idx = hw_idx / filter_w; const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w; const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel const int offset = input_offset + ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
+ in_channel_idx; + in_channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = 0; VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
...@@ -79,6 +80,7 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic ...@@ -79,6 +80,7 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
} }
__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset,
__private const int filter_w, __private const int filter_w,
__private const int in_channel, __private const int in_channel,
__private const int multiplier, __private const int multiplier,
...@@ -92,7 +94,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w ...@@ -92,7 +94,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
const int h_idx = w / filter_w; const int h_idx = w / filter_w;
const int w_idx = w % filter_w; const int w_idx = w % filter_w;
const int offset = mad24(mad24(h_idx, filter_w, w_idx), const int offset = input_offset + mad24(mad24(h_idx, filter_w, w_idx),
in_channel, in_channel_idx); in_channel, in_channel_idx);
const int size = in_channel - in_channel_idx; const int size = in_channel - in_channel_idx;
...@@ -117,7 +119,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w ...@@ -117,7 +119,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
const int h_idx = hw_idx / filter_w; const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w; const int w_idx = hw_idx % filter_w;
const int offset = mad24(mad24(mad24(h_idx, filter_w, w_idx), const int offset = input_offset + mad24(mad24(mad24(h_idx, filter_w, w_idx),
in_channel, in_channel_idx), in_channel, in_channel_idx),
multiplier, m); multiplier, m);
// TODO support multiplier > 1 // TODO support multiplier > 1
...@@ -128,6 +130,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w ...@@ -128,6 +130,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
} }
__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height, __private const int height,
__private const int width, __private const int width,
__private const int channels, __private const int channels,
...@@ -138,7 +141,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ ...@@ -138,7 +141,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
const int height_idx = h % height; const int height_idx = h % height;
const int width_idx = w % width; const int width_idx = w % width;
const int channel_idx = w / width * 4; const int channel_idx = w / width * 4;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx; + channel_idx;
const int size = channels - channel_idx; const int size = channels - channel_idx;
...@@ -191,13 +194,16 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ ...@@ -191,13 +194,16 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
} }
__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int count, __private const int count,
__write_only image2d_t output) { __write_only image2d_t output) {
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
const int offset = w * 4;
const int size = count - offset; const int offset = input_offset + w * 4;
const int size = count - w * 4;
VEC_DATA_TYPE(DATA_TYPE, 4) values = 0; VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
if (size < 4) { if (size < 4) {
switch(size) { switch(size) {
...@@ -241,6 +247,7 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ ...@@ -241,6 +247,7 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
__private const int input_offset,
__private const int height, __private const int height,
__private const int width, __private const int width,
__private const int channels, __private const int channels,
...@@ -253,7 +260,7 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n ...@@ -253,7 +260,7 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
const int height_idx = (h % height_blks) << 2; const int height_idx = (h % height_blks) << 2;
const int width_idx = w % width; const int width_idx = w % width;
const int channel_idx = w / width; const int channel_idx = w / width;
int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx; + channel_idx;
int size = height - height_idx; int size = height - height_idx;
...@@ -304,6 +311,7 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc ...@@ -304,6 +311,7 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height, __private const int height,
__private const int width, __private const int width,
__private const int channels, __private const int channels,
...@@ -314,7 +322,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n ...@@ -314,7 +322,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
const int height_idx = h % height; const int height_idx = h % height;
const int width_idx = (w % width) << 2; const int width_idx = (w % width) << 2;
const int channel_idx = w / width; const int channel_idx = w / width;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx; + channel_idx;
int size = width - width_idx; int size = width - width_idx;
...@@ -336,6 +344,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n ...@@ -336,6 +344,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
// only support 3x3 now // only support 3x3 now
__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels, __private const int in_channels,
__private const int height, __private const int height,
__private const int width, __private const int width,
...@@ -345,7 +354,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, / ...@@ -345,7 +354,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
const int out_channels = get_global_size(1); const int out_channels = get_global_size(1);
const int out_channel_idx = h; const int out_channel_idx = h;
const int in_channel_idx = w << 2; const int in_channel_idx = w << 2;
const int offset = (out_channel_idx * in_channels + in_channel_idx) * height * width; const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int length = min((in_channels - in_channel_idx) * 9, 36); const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0}; DATA_TYPE in[36] = {0};
DATA_TYPE4 tt; DATA_TYPE4 tt;
...@@ -390,7 +399,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, / ...@@ -390,7 +399,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
tu3[3] = tu3[2]; tu3[3] = tu3[2];
tu3[2] = tt - tu3[1] / 2; tu3[2] = tt - tu3[1] / 2;
tu3[1] = tt + tu3[1] / 2; tu3[1] = tt + tu3[1] / 2;
int2 coord = (int2)(w, h); int2 coord = (int2)(w, h);
#pragma unroll #pragma unroll
for (short i = 0; i < 4; ++i) { for (short i = 0; i < 4; ++i) {
......
...@@ -65,13 +65,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -65,13 +65,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t kernel_h = filter->dim(0); index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1); index_t kernel_w = filter->dim(1);
if (!input->is_image() || strides_[0] != strides_[1] || if (strides_[0] != strides_[1] ||
(dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) { (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with " LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< ",dilations " << dilations_[0] << "x" << dilations_[1] << ",dilations " << dilations_[0] << "x" << dilations_[1]
<< " and input image: " << input->is_image()
<< " is not implemented yet."; << " is not implemented yet.";
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
......
...@@ -27,13 +27,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -27,13 +27,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels}; std::vector<index_t> output_shape{batch, out_height, out_width, channels};
if (input->is_image()) {
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape,
output->ResizeImage(output_shape, output_image_shape); BufferType::IN_OUT_CHANNEL,
} else { output_image_shape);
output->Resize(output_shape); output->ResizeImage(output_shape, output_image_shape);
}
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册