Refactor tensor, accelerate gpu initialization time

bcbcbf7a · 李寅 · d548b461 · bcbcbf7a · bcbcbf7a · bcbcbf7a
24 changed file
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -26,17 +26,17 @@ class Allocator {
 public:
  Allocator() {}
  virtual ~Allocator() noexcept {}
-  virtual void *New(size_t nbytes) = 0;
+  virtual void *New(size_t nbytes) const = 0;
  virtual void *NewImage(const std::vector<size_t> &image_shape,
-                         const DataType dt) = 0;
-  virtual void Delete(void *data) = 0;
-  virtual void DeleteImage(void *data) = 0;
-  virtual void *Map(void *buffer, size_t nbytes) = 0;
+                         const DataType dt) const = 0;
+  virtual void Delete(void *data) const = 0;
+  virtual void DeleteImage(void *data) const = 0;
+  virtual void *Map(void *buffer, size_t offset, size_t nbytes) const = 0;
  virtual void *MapImage(void *buffer,
                         const std::vector<size_t> &image_shape,
-                         std::vector<size_t> &mapped_image_pitch) = 0;
-  virtual void Unmap(void *buffer, void *mapper_ptr) = 0;
-  virtual bool OnHost() = 0;
+                         std::vector<size_t> *mapped_image_pitch) const = 0;
+  virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
+  virtual bool OnHost() const = 0;

  template <typename T>
  T *New(size_t num_elements) {
@@ -52,7 +52,7 @@ class Allocator {
 class CPUAllocator : public Allocator {
 public:
  ~CPUAllocator() override {}
-  void *New(size_t nbytes) override {
+  void *New(size_t nbytes) const override {
    VLOG(3) << "Allocate CPU buffer: " << nbytes;
    void *data = nullptr;
 #ifdef __ANDROID__
@@ -67,27 +67,29 @@ class CPUAllocator : public Allocator {
  }

  void *NewImage(const std::vector<size_t> &shape,
-                 const DataType dt) override {
+                 const DataType dt) const override {
    LOG(FATAL) << "Allocate CPU image";
    return nullptr;
  }

-  void Delete(void *data) override {
+  void Delete(void *data) const override {
    VLOG(3) << "Free CPU buffer";
    free(data);
  }
-  void DeleteImage(void *data) override {
+  void DeleteImage(void *data) const override {
    LOG(FATAL) << "Free CPU image";
    free(data);
  };
-  void *Map(void *buffer, size_t nbytes) override { return buffer; }
+  void *Map(void *buffer, size_t offset, size_t nbytes) const override {
+    return (char*)buffer + offset;
+  }
  void *MapImage(void *buffer,
                 const std::vector<size_t> &image_shape,
-                 std::vector<size_t> &mapped_image_pitch) override {
+                 std::vector<size_t> *mapped_image_pitch) const override {
    return buffer;
  }
-  void Unmap(void *buffer, void *mapper_ptr) override {}
-  bool OnHost() override { return true; }
+  void Unmap(void *buffer, void *mapper_ptr) const override {}
+  bool OnHost() const override { return true; }
 };

 std::map<int32_t, Allocator *> *gAllocatorRegistry();

--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
+//
+// Copyright (c) 2018 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_CORE_BUFFER_H_
+#define MACE_CORE_BUFFER_H_
+
+#include "mace/core/types.h"
+#include "mace/core/allocator.h"
+#include <vector>
+
+namespace mace {
+
+class BufferBase {
+ public:
+  BufferBase() : size_(0) {}
+  BufferBase(index_t size) : size_(size) {}
+  virtual ~BufferBase() {}
+
+  virtual void *buffer() = 0;
+
+  virtual const void *raw_data() const = 0;
+
+  virtual void *raw_mutable_data() = 0;
+
+  virtual void *Map(index_t offset,
+                    index_t length,
+                    std::vector<size_t> *pitch) const = 0;
+
+  virtual void UnMap(void *mapped_ptr) const = 0;
+
+  virtual void Map(std::vector<size_t> *pitch) = 0;
+
+  virtual void UnMap() = 0;
+
+  virtual void Resize(index_t size) = 0;
+
+  virtual void Copy(void *src, index_t offset, index_t length) = 0;
+
+  virtual bool OnHost() const = 0;
+
+  virtual index_t offset() const {
+    return 0;
+  };
+
+  template<typename T>
+  const T *data() const {
+    return reinterpret_cast<const T *>(raw_data());
+  }
+
+  template<typename T>
+  T *mutable_data() {
+    return reinterpret_cast<T *>(raw_mutable_data());
+  }
+
+  index_t size() const {
+    return size_;
+  }
+
+ protected:
+  index_t size_;
+};
+
+class Buffer : public BufferBase {
+ public:
+  Buffer(Allocator *allocator)
+    : BufferBase(0),
+      allocator_(allocator),
+      buf_(nullptr),
+      mapped_buf_(nullptr),
+      is_data_owner_(true) {}
+
+  Buffer(Allocator *allocator, index_t size)
+    : BufferBase(size),
+      allocator_(allocator),
+      mapped_buf_(nullptr),
+      is_data_owner_(true) {
+    buf_ = allocator->New(size);
+  }
+
+  Buffer(Allocator *allocator, void *data, index_t size)
+    : BufferBase(size),
+      allocator_(allocator),
+      buf_(data),
+      mapped_buf_(nullptr),
+      is_data_owner_(false) {}
+
+  virtual ~Buffer() {
+    if (mapped_buf_ != nullptr) {
+      UnMap();
+    }
+    if (is_data_owner_ && buf_ != nullptr) {
+      allocator_->Delete(buf_);
+    }
+  }
+
+  void *buffer() {
+    MACE_CHECK_NOTNULL(buf_);
+    return buf_;
+  };
+
+  const void *raw_data() const {
+    if (OnHost()) {
+      MACE_CHECK_NOTNULL(buf_);
+      return buf_;
+    } else {
+      MACE_CHECK_NOTNULL(mapped_buf_);
+      return mapped_buf_;
+    }
+  }
+
+  void *raw_mutable_data() {
+    if (OnHost()) {
+      MACE_CHECK_NOTNULL(buf_);
+      return buf_;
+    } else {
+      MACE_CHECK_NOTNULL(mapped_buf_);
+      return mapped_buf_;
+    }
+  }
+
+  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
+    MACE_CHECK_NOTNULL(buf_);
+    return allocator_->Map(buf_, offset, length);
+  }
+
+  void UnMap(void *mapped_ptr) const {
+    MACE_CHECK_NOTNULL(buf_);
+    MACE_CHECK_NOTNULL(mapped_ptr);
+    allocator_->Unmap(buf_, mapped_ptr);
+  }
+
+  void Map(std::vector<size_t> *pitch) {
+    MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
+    mapped_buf_ = Map(0, size_, pitch);
+  };
+
+  void UnMap() {
+    UnMap(mapped_buf_);
+    mapped_buf_ = nullptr;
+  }
+
+  void Resize(index_t size) {
+    MACE_CHECK(is_data_owner_,
+               "data is not owned by this buffer, cannot resize");
+    if (size != size_) {
+      if (buf_ != nullptr) {
+        allocator_->Delete(buf_);
+      }
+      size_ = size;
+      buf_ = allocator_->New(size);
+    }
+  }
+
+  void Copy(void *src, index_t offset, index_t length) {
+    MACE_CHECK_NOTNULL(mapped_buf_);
+    MACE_CHECK(length <= size_, "out of buffer");
+    memcpy(mapped_buf_, (char *) src + offset, length);
+  }
+
+  bool OnHost() const {
+    return allocator_->OnHost();
+  }
+
+ private:
+  Allocator *allocator_;
+  void *buf_;
+  void *mapped_buf_;
+  bool is_data_owner_;
+
+ DISABLE_COPY_AND_ASSIGN(Buffer);
+};
+
+class Image : public BufferBase {
+ public:
+  Image()
+    : BufferBase(0),
+      allocator_(GetDeviceAllocator(OPENCL)),
+      buf_(nullptr),
+      mapped_buf_(nullptr) {}
+
+  Image(std::vector<size_t> shape, DataType data_type)
+    : BufferBase(std::accumulate(shape.begin(), shape.end(),
+                                 1, std::multiplies<index_t>())
+                   * GetEnumTypeSize(data_type)),
+      allocator_(GetDeviceAllocator(OPENCL)),
+      mapped_buf_(nullptr) {
+    shape_ = shape;
+    data_type_ = data_type;
+    buf_ = allocator_->NewImage(shape, data_type);
+  }
+
+  virtual ~Image() {
+    if (mapped_buf_ != nullptr) {
+      UnMap();
+    }
+    if (buf_ != nullptr) {
+      allocator_->DeleteImage(buf_);
+    }
+  }
+
+  void *buffer() {
+    MACE_CHECK_NOTNULL(buf_);
+    return buf_;
+  };
+
+  const void *raw_data() const {
+    MACE_CHECK_NOTNULL(mapped_buf_);
+    return mapped_buf_;
+  }
+
+  void *raw_mutable_data() {
+    MACE_CHECK_NOTNULL(mapped_buf_);
+    return mapped_buf_;
+  }
+
+  std::vector<size_t> image_shape() const {
+    return shape_;
+  }
+
+  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
+    MACE_NOT_IMPLEMENTED;
+    return nullptr;
+  }
+
+  void UnMap(void *mapped_ptr) const {
+    MACE_CHECK_NOTNULL(buf_);
+    MACE_CHECK_NOTNULL(mapped_ptr);
+    allocator_->Unmap(buf_, mapped_ptr);
+  }
+
+  void Map(std::vector<size_t> *pitch) {
+    MACE_CHECK_NOTNULL(buf_);
+    MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
+    MACE_CHECK_NOTNULL(pitch);
+    mapped_buf_ = allocator_->MapImage(buf_, shape_, pitch);
+  };
+
+  void UnMap() {
+    UnMap(mapped_buf_);
+    mapped_buf_ = nullptr;
+  };
+
+  void Resize(index_t size) {}
+
+  void Copy(void *src, index_t offset, index_t length) {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  bool OnHost() const {
+    return allocator_->OnHost();
+  }
+
+ private:
+  Allocator *allocator_;
+  std::vector<size_t> shape_;
+  DataType data_type_;
+  void *buf_;
+  void *mapped_buf_;
+
+ DISABLE_COPY_AND_ASSIGN(Image);
+};
+
+class BufferSlice : public BufferBase {
+ public:
+  BufferSlice() {}
+  BufferSlice(BufferBase *buffer, index_t offset, index_t length)
+    : BufferBase(buffer->size()),
+      buffer_(buffer),
+      mapped_buf_(nullptr),
+      offset_(offset),
+      length_(length) {
+    MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
+    MACE_CHECK(offset + length <= size_,
+               "buffer slice offset + length (",
+               offset,
+               " + ",
+               length,
+               ") should <= ",
+               size_);
+  }
+  BufferSlice(const BufferSlice &other) : BufferSlice(other.buffer_,
+                                                      other.offset_,
+                                                      other.length_) {}
+
+  ~BufferSlice() {
+    if (mapped_buf_ != nullptr) {
+      UnMap();
+    }
+  }
+
+  void *buffer() {
+    return buffer_->buffer();
+  };
+
+  const void *raw_data() const {
+    if (OnHost()) {
+      MACE_CHECK_NOTNULL(buffer_);
+      return (char *) buffer_->raw_data() + offset_;
+    } else {
+      MACE_CHECK_NOTNULL(mapped_buf_);
+      return mapped_buf_;
+    }
+  }
+
+  void *raw_mutable_data() {
+    MACE_NOT_IMPLEMENTED;
+    return nullptr;
+  }
+
+  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
+    MACE_NOT_IMPLEMENTED;
+    return nullptr;
+  }
+
+  void UnMap(void *mapped_ptr) const {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  void Map(std::vector<size_t> *pitch) {
+    MACE_CHECK_NOTNULL(buffer_);
+    MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null");
+    mapped_buf_ = buffer_->Map(offset_, length_, pitch);
+  };
+
+  void UnMap() {
+    MACE_CHECK_NOTNULL(mapped_buf_);
+    buffer_->UnMap(mapped_buf_);
+    mapped_buf_ = nullptr;
+  };
+
+  void Resize(index_t size) {
+  }
+
+  void Copy(void *src, index_t offset, index_t length) {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  index_t offset() const {
+    return offset_;
+  }
+
+  bool OnHost() const {
+    return buffer_->OnHost();
+  }
+
+ private:
+  BufferBase *buffer_;
+  void *mapped_buf_;
+  index_t offset_;
+  index_t length_;
+};
+
+}
+
+#endif // MACE_CORE_BUFFER_H_
--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -542,7 +542,6 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
    if (!net->Run()) {
      LOG(FATAL) << "Net init run failed";
    }
-    ws_->RemoveUnsedTensor();
    net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type));
  }
 }

--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -15,14 +15,14 @@ NetBase::NetBase(const std::shared_ptr<const OperatorRegistry> op_registry,
                 DeviceType type)
    : op_registry_(op_registry), name_(net_def->name()) {}

-SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry,
+SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
                     const std::shared_ptr<const NetDef> net_def,
                     Workspace *ws,
                     DeviceType type,
                     const NetMode mode)
    :  NetBase(op_registry, net_def, ws, type),
      device_type_(type) {
-  MACE_LATENCY_LOGGER(1, "Constructing SimpleNet ", net_def->name());
+  MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
  for (int idx = 0; idx < net_def->op_size(); ++idx) {
    const auto &operator_def = net_def->op(idx);
    VLOG(3) << "Creating operator " << operator_def.name() << "("
@@ -36,7 +36,7 @@ SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry,
  }
 }

-bool SimpleNet::Run(RunMetadata *run_metadata) {
+bool SerialNet::Run(RunMetadata *run_metadata) {
  MACE_MEMORY_LOGGING_GUARD();
  MACE_LATENCY_LOGGER(1, "Running net");
  for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
@@ -99,7 +99,7 @@ std::unique_ptr<NetBase> CreateNet(
    Workspace *ws,
    DeviceType type,
    const NetMode mode) {
-  std::unique_ptr<NetBase> net(new SimpleNet(op_registry, net_def, ws, type, mode));
+  std::unique_ptr<NetBase> net(new SerialNet(op_registry, net_def, ws, type, mode));
  return net;
 }


--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -33,9 +33,9 @@ class NetBase {
  DISABLE_COPY_AND_ASSIGN(NetBase);
 };

-class SimpleNet : public NetBase {
+class SerialNet : public NetBase {
 public:
-  SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry,
+  SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
            const std::shared_ptr<const NetDef> net_def,
            Workspace *ws,
            DeviceType type,
@@ -47,7 +47,7 @@ class SimpleNet : public NetBase {
  std::vector<std::unique_ptr<OperatorBase> > operators_;
  DeviceType device_type_;

-  DISABLE_COPY_AND_ASSIGN(SimpleNet);
+  DISABLE_COPY_AND_ASSIGN(SerialNet);
 };

 std::unique_ptr<NetBase> CreateNet(

--- a/mace/core/preallocated_pooled_allocator.h
+++ b/mace/core/preallocated_pooled_allocator.h
@@ -5,6 +5,7 @@
 #ifndef MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
 #define MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_

+#include <unordered_map>
 #include "mace/core/allocator.h"

 namespace mace {
@@ -13,17 +14,26 @@ class PreallocatedPooledAllocator {
 public:
  PreallocatedPooledAllocator() {}

-  virtual ~PreallocatedPooledAllocator() noexcept {}
+  ~PreallocatedPooledAllocator() noexcept {}

-  virtual void PreallocateImage(int mem_id,
-                                const std::vector<size_t> &image_shape,
-                                DataType data_type) = 0;
+  void SetBuffer(int mem_id, std::unique_ptr<BufferBase> &&buffer) {
+    buffers_[mem_id] = std::move(buffer);
+  }

-  virtual void *GetImage(int mem_id) = 0;
+  BufferBase *GetBuffer(int mem_id) {
+    if (buffers_.find(mem_id) != buffers_.end()) {
+      return buffers_[mem_id].get();
+    } else {
+      return nullptr;
+    }
+  }

-  virtual bool HasImage(int mem_id) = 0;
+  virtual bool HasBuffer(int mem_id) {
+    return buffers_.find(mem_id) != buffers_.end();
+  }

-  virtual std::vector<size_t> GetImageSize(int mem_id) = 0;
+ private:
+  std::unordered_map<int, std::unique_ptr<BufferBase>> buffers_;
 };

 } // namespace mace

--- a/mace/core/runtime/hexagon/hexagon_control_wrapper.h
+++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.h
@@ -11,7 +11,6 @@
 #include "mace/core/runtime/hexagon/quantize.h"
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
-#include "mace/core/serializer.h"

 namespace mace {

@@ -37,7 +36,6 @@ class HexagonControlWrapper {
  void SetGraphMode(int mode);

 private:
-  // CAVEAT: Need offset as HVX library reserves some ids
  static constexpr int NODE_ID_OFFSET = 10000;

  inline uint32_t node_id(uint32_t nodeid) {
@@ -45,7 +43,6 @@ class HexagonControlWrapper {
  }

  int nn_id_;
-  Serializer serializer_;
  Quantizer quantizer_;

  std::vector<std::vector<index_t>> input_shapes_;

--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -35,7 +35,7 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
 OpenCLAllocator::OpenCLAllocator() {}

 OpenCLAllocator::~OpenCLAllocator() {}
-void *OpenCLAllocator::New(size_t nbytes) {
+void *OpenCLAllocator::New(size_t nbytes) const {
  VLOG(3) << "Allocate OpenCL buffer: " << nbytes;
  cl_int error;
  cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
@@ -47,7 +47,7 @@ void *OpenCLAllocator::New(size_t nbytes) {
 }

 void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
-                                const DataType dt) {
+                                const DataType dt) const {
  MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
  VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1];

@@ -67,7 +67,7 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
  return cl_image;
 }

-void OpenCLAllocator::Delete(void *buffer) {
+void OpenCLAllocator::Delete(void *buffer) const {
  VLOG(3) << "Free OpenCL buffer";
  if (buffer != nullptr) {
    cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
@@ -75,7 +75,7 @@ void OpenCLAllocator::Delete(void *buffer) {
  }
 }

-void OpenCLAllocator::DeleteImage(void *buffer) {
+void OpenCLAllocator::DeleteImage(void *buffer) const {
  VLOG(3) << "Free OpenCL image";
  if (buffer != nullptr) {
    cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
@@ -83,13 +83,13 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
  }
 }

-void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
+void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
  auto cl_buffer = static_cast<cl::Buffer *>(buffer);
  auto queue = OpenCLRuntime::Global()->command_queue();
  // TODO(heliangliang) Non-blocking call
  cl_int error;
  void *mapped_ptr =
-      queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+      queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
                             nbytes, nullptr, nullptr, &error);
  MACE_CHECK(error == CL_SUCCESS);
  return mapped_ptr;
@@ -98,33 +98,33 @@ void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
 // TODO : there is something wrong with half type.
 void *OpenCLAllocator::MapImage(void *buffer,
                                const std::vector<size_t> &image_shape,
-                                std::vector<size_t> &mapped_image_pitch) {
+                                std::vector<size_t> *mapped_image_pitch) const {
  MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
  auto cl_image = static_cast<cl::Image2D *>(buffer);
  std::array<size_t, 3> origin = {0, 0, 0};
  std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};

-  mapped_image_pitch.resize(2);
+  mapped_image_pitch->resize(2);
  cl_int error;
  void *mapped_ptr =
      OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image,
                                                            CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                                                            origin, region,
-                                                            &mapped_image_pitch[0],
-                                                            &mapped_image_pitch[1],
+                                                            mapped_image_pitch->data(),
+                                                            mapped_image_pitch->data() + 1,
                                                            nullptr, nullptr, &error);
  MACE_CHECK(error == CL_SUCCESS) << error;

  return mapped_ptr;
 }

-void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
+void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
  auto cl_buffer = static_cast<cl::Buffer *>(buffer);
  auto queue = OpenCLRuntime::Global()->command_queue();
  MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr,
                                         nullptr) == CL_SUCCESS);
 }

-bool OpenCLAllocator::OnHost() { return false; }
+bool OpenCLAllocator::OnHost() const { return false; }

 }  // namespace mace
--- a/mace/core/runtime/opencl/opencl_allocator.h
+++ b/mace/core/runtime/opencl/opencl_allocator.h
@@ -15,7 +15,7 @@ class OpenCLAllocator : public Allocator {

  ~OpenCLAllocator() override;

-  void *New(size_t nbytes) override;
+  void *New(size_t nbytes) const override;

  /*
   * Use Image2D with RGBA (128-bit) format to represent the image.
@@ -23,21 +23,21 @@ class OpenCLAllocator : public Allocator {
   * @ shape : [depth, ..., height, width ].
   */
  void *NewImage(const std::vector<size_t> &image_shape,
-                 const DataType dt) override;
+                 const DataType dt) const override;

-  void Delete(void *buffer) override;
+  void Delete(void *buffer) const override;

-  void DeleteImage(void *buffer) override;
+  void DeleteImage(void *buffer) const override;

-  void *Map(void *buffer, size_t nbytes) override;
+  void *Map(void *buffer, size_t offset, size_t nbytes) const override;

  void *MapImage(void *buffer,
                 const std::vector<size_t> &image_shape,
-                 std::vector<size_t> &mapped_image_pitch) override;
+                 std::vector<size_t> *mapped_image_pitch) const override;

-  void Unmap(void *buffer, void *mapped_ptr) override;
+  void Unmap(void *buffer, void *mapped_ptr) const override;

-  bool OnHost() override;
+  bool OnHost() const override;
 };

 }  // namespace mace

--- a/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.cc
-//
-// Copyright (c) 2018 XiaoMi All rights reserved.
-//
-
-#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"
-
-namespace mace {
-
-OpenCLPreallocatedPooledAllocator::OpenCLPreallocatedPooledAllocator()
-  : allocator(GetDeviceAllocator(DeviceType::OPENCL)) {
-}
-
-OpenCLPreallocatedPooledAllocator::~OpenCLPreallocatedPooledAllocator() {
-}
-
-void OpenCLPreallocatedPooledAllocator::PreallocateImage(int mem_id,
-                                                         const std::vector<
-                                                           size_t> &image_shape,
-                                                         DataType data_type) {
-  MACE_CHECK(!this->HasImage(mem_id), "Memory already exists: ", mem_id);
-  VLOG(2) << "Preallocate OpenCL image: " << mem_id << " "
-          << image_shape[0] << ", " << image_shape[1];
-  images_[mem_id] = std::move(std::unique_ptr<void, std::function<void(void *)>>(
-    allocator->NewImage(image_shape, data_type), [this](void *p) {
-      this->allocator->DeleteImage(p);
-    }));
-  image_shapes_[mem_id] = image_shape;
-}
-
-} // namespace mace
--- a/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h
+++ b/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h
-//
-// Copyright (c) 2018 XiaoMi All rights reserved.
-//
-
-#ifndef MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
-#define MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
-
-#include "mace/core/preallocated_pooled_allocator.h"
-#include <unordered_map>
-
-namespace mace {
-
-class OpenCLPreallocatedPooledAllocator : public PreallocatedPooledAllocator {
- public:
-  OpenCLPreallocatedPooledAllocator();
-
-  ~OpenCLPreallocatedPooledAllocator() override;
-
-  void PreallocateImage(int mem_id,
-                        const std::vector<size_t> &image_shape,
-                        DataType data_type) override;
-
-  inline void *GetImage(int mem_id) override {
-    MACE_CHECK(HasImage(mem_id), "image does not exist");
-    return images_[mem_id].get();
-  }
-
-  inline bool HasImage(int mem_id) override {
-    return images_.find(mem_id) != images_.end();
-  }
-
-  inline std::vector<size_t> GetImageSize(int mem_id) override {
-    return image_shapes_[mem_id];
-  }
-
- private:
-  std::unordered_map<int, std::unique_ptr<void, std::function<void(void *)>>>
-    images_;
-  std::unordered_map<int, std::vector<size_t>> image_shapes_;
-  Allocator *allocator;
-};
-
-} // namepsace mace
-
-#endif // MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
--- a/mace/core/serializer.cc
+++ b/mace/core/serializer.cc
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-#include "mace/core/serializer.h"
-
-namespace mace {
-
-std::unique_ptr<ConstTensor> Serializer::Serialize(const Tensor &tensor,
-                                                   const std::string &name) {
-  MACE_NOT_IMPLEMENTED;
-  return nullptr;
-}
-
-std::unique_ptr<Tensor> Serializer::Deserialize(const ConstTensor &const_tensor,
-                                                DeviceType type) {
-  std::unique_ptr<Tensor> tensor(
-      new Tensor(GetDeviceAllocator(type), const_tensor.data_type()));
-  std::vector<index_t> dims;
-  for (const index_t d : const_tensor.dims()) {
-    dims.push_back(d);
-  }
-  tensor->Resize(dims);
-
-  switch (const_tensor.data_type()) {
-    case DT_HALF:
-      tensor->Copy<half>(reinterpret_cast<const half *>(const_tensor.data()),
-                         const_tensor.data_size());
-      break;
-    case DT_FLOAT:
-      tensor->Copy<float>(reinterpret_cast<const float *>(const_tensor.data()),
-                          const_tensor.data_size());
-      break;
-    case DT_DOUBLE:
-      tensor->Copy<double>(
-          reinterpret_cast<const double *>(const_tensor.data()),
-          const_tensor.data_size());
-      break;
-    case DT_INT32:
-      tensor->Copy<int32_t>(
-          reinterpret_cast<const int32_t *>(const_tensor.data()),
-          const_tensor.data_size());
-      break;
-    case DT_INT64:
-      tensor->Copy<int64_t>(
-          reinterpret_cast<const int64_t *>(const_tensor.data()),
-          const_tensor.data_size());
-      break;
-    case DT_UINT8:
-      tensor->Copy<uint8_t>(
-          reinterpret_cast<const uint8_t *>(const_tensor.data()),
-          const_tensor.data_size());
-      break;
-    case DT_INT16:
-      tensor->CopyWithCast<int32_t, uint16_t>(
-          reinterpret_cast<const int32_t *>(const_tensor.data()),
-          const_tensor.data_size());
-      break;
-    case DT_INT8:
-      tensor->CopyWithCast<int32_t, int8_t>(
-          reinterpret_cast<const int32_t *>(const_tensor.data()),
-          const_tensor.data_size());
-      break;
-    case DT_UINT16:
-      tensor->CopyWithCast<int32_t, int16_t>(
-          reinterpret_cast<const int32_t *>(const_tensor.data()),
-          const_tensor.data_size());
-      break;
-    case DT_BOOL:
-      tensor->CopyWithCast<int32_t, bool>(
-          reinterpret_cast<const int32_t *>(const_tensor.data()),
-          const_tensor.data_size());
-      break;
-    default:
-      MACE_NOT_IMPLEMENTED;
-      break;
-  }
-
-  return tensor;
-}
-
-}  // namespace mace
--- a/mace/core/serializer.h
+++ b/mace/core/serializer.h
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-#ifndef MACE_CORE_SERIALIZER_H_
-#define MACE_CORE_SERIALIZER_H_
-
-#include "mace/core/tensor.h"
-#include "mace/public/mace.h"
-
-namespace mace {
-
-class Serializer {
- public:
-  Serializer() {}
-  ~Serializer() {}
-
-  std::unique_ptr<ConstTensor> Serialize(const Tensor &tensor,
-                                         const std::string &name);
-
-  std::unique_ptr<Tensor> Deserialize(const ConstTensor &const_tensor,
-                                      DeviceType type);
-
-  DISABLE_COPY_AND_ASSIGN(Serializer);
-};
-
-}  // namespace mace
-
-#endif  // MACE_CORE_SERIALIZER_H_
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -5,7 +5,7 @@
 #ifndef MACE_CORE_TENSOR_H_
 #define MACE_CORE_TENSOR_H_

-#include "mace/core/allocator.h"
+#include "mace/core/buffer.h"
 #include "mace/utils/logging.h"
 #include "mace/core/types.h"
 #include "mace/public/mace.h"
@@ -46,7 +46,6 @@ namespace mace {
  CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, LOG(FATAL) << "Type not set"; \
                     , LOG(FATAL) << "Unexpected type: " << TYPE_ENUM;)

-
 namespace numerical_chars {
 inline std::ostream &operator<<(std::ostream &os, char c) {
  return std::is_signed<char>::value ? os << static_cast<int>(c)
@@ -64,27 +63,30 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {

 class Tensor {
 public:
-  Tensor()
-      : alloc_(GetDeviceAllocator(DeviceType::CPU)),
-        size_(0),
-        dtype_(DT_FLOAT),
-        buffer_(nullptr),
-        data_(nullptr),
-        unused_(false),
-        is_image_(false){
-};
-
  Tensor(Allocator *alloc, DataType type)
-      : alloc_(alloc),
-        size_(0),
+    : allocator_(alloc),
      dtype_(type),
      buffer_(nullptr),
-        data_(nullptr),
-        unused_(false),
-        is_image_(false){
-  };
+      is_buffer_owner_(true) {};
+
+  Tensor(BufferBase *buffer, DataType dtype)
+    : dtype_(dtype),
+      buffer_(buffer),
+      is_buffer_owner_(false) {}
+
+  Tensor(const BufferSlice &buffer_slice, DataType dtype)
+    : dtype_(dtype),
+      buffer_slice_(buffer_slice),
+      is_buffer_owner_(false) {
+    buffer_ = &buffer_slice_;
+  }
+
+  Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {}

  ~Tensor() {
+    if (is_buffer_owner_ && buffer_ != nullptr) {
+      delete buffer_;
+    }
  }

  inline DataType dtype() const { return dtype_; }
@@ -93,10 +95,6 @@ class Tensor {

  inline const std::vector<index_t> &shape() const { return shape_; }

-  inline const std::vector<size_t> &image_shape() const { return image_shape_; }
-
-  inline const bool is_image() const { return is_image_; }
-
  inline index_t dim_size() const { return shape_.size(); }

  inline index_t dim(unsigned int index) const {
@@ -105,170 +103,105 @@ class Tensor {
    return shape_[index];
  }

-  inline index_t size() const { return size_; }
-
-  inline index_t raw_size() const { return size_ * SizeOfType(); }
-
-  inline const bool unused() const { return unused_; }
-
-  inline int64_t NumElements() const {
+  inline index_t size() const {
    return std::accumulate(shape_.begin(), shape_.end(), 1,
                           std::multiplies<int64_t>());
  }

-  inline const bool OnHost() const { return alloc_->OnHost(); }
-
-  /*
-   * Map the device buffer as CPU buffer to access the data, unmap must be
-   * called later
-   */
-  inline void Map() const {
-    if (!OnHost()) {
-      MACE_CHECK(buffer_ != nullptr && data_ == nullptr);
-      data_ = alloc_->Map(buffer_.get(), size_ * SizeOfType());
-    }
+  inline index_t raw_size() const {
+    return size() * SizeOfType();
  }

-  inline void MapImage(std::vector<size_t> &mapped_image_pitch) const {
-    MACE_CHECK(!OnHost() && buffer_ != nullptr && data_ == nullptr);
-    data_ = alloc_->MapImage(buffer_.get(), image_shape_, mapped_image_pitch);
+  inline void *buffer() const {
+    MACE_CHECK(buffer_ != nullptr && buffer_->buffer() != nullptr,
+               "buffer is null");
+    return buffer_->buffer();
  }

-  /*
-   *  Unmap the device buffer
-   */
-  inline void Unmap() const {
-    if (!OnHost()) {
-      MACE_CHECK(buffer_ != nullptr && data_ != nullptr);
-      alloc_->Unmap(buffer_.get(), data_);
-      data_ = nullptr;
+  inline index_t buffer_offset() const {
+    return buffer_->offset();
  }
-  }
-
-  void *buffer() const { return buffer_.get(); }

  inline const void *raw_data() const {
-    void *data = MappedBuffer();
-    MACE_CHECK(data != nullptr || size_ == 0,
-               "The tensor is of non-zero shape, but its data is not allocated "
-               "or mapped yet.");
-    return data;
+    MACE_CHECK(buffer_ != nullptr, "buffer is null");
+    return buffer_->raw_data();
  }

-  template <typename T>
+  template<typename T>
  inline const T *data() const {
-    return static_cast<const T *>(raw_data());
+    MACE_CHECK(buffer_ != nullptr, "buffer is null");
+    return buffer_->data<T>();
  }

  inline void *raw_mutable_data() {
-    void *data = MappedBuffer();
-    MACE_CHECK(data != nullptr || size_ == 0,
-               "The tensor is of non-zero shape, but its data is not allocated "
-               "or mapped yet.");
-    return data;
+    MACE_CHECK(buffer_ != nullptr, "buffer is null");
+    return buffer_->raw_mutable_data();
  }

-  template <typename T>
+  template<typename T>
  inline T *mutable_data() {
-    return static_cast<T *>(raw_mutable_data());
+    MACE_CHECK(buffer_ != nullptr, "buffer is null");
+    return static_cast<T *>(buffer_->raw_mutable_data());
  }

-  inline void Resize(const std::vector<index_t> &shape) {
-    MACE_CHECK(!is_image_ || buffer_ == nullptr,
-               "Resize is not for image, use ResizeImage instead.");
-    is_image_ = false;
+  inline void Reshape(const std::vector<index_t> &shape) {
    shape_ = shape;
-    index_t size = NumElements();
-    if (size_ != size) {
-      size_ = size;
-      MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
-      CASES(dtype_,
-            (buffer_ =
-               std::move(std::unique_ptr<void, std::function<void(void *)>>(
-                 alloc_->New(size_ * sizeof(T)),
-                 [this](void *p) {
-                   this->alloc_->Delete(p);
-                 })
-               )));
-    }
+    MACE_CHECK(raw_size() <= buffer_->size());
  }

-  inline void ResizeImage(const std::vector<index_t> &shape,
-                          const std::vector<size_t> &image_shape) {
-    MACE_CHECK(is_image_ || buffer_ == nullptr,
-               "ResizeImage is not for buffer, use Resize instead.");
-    is_image_ = true;
+  inline void Resize(const std::vector<index_t> &shape) {
    shape_ = shape;
-    index_t size = NumElements();
-    if (size_ != size) {
-      size_ = size;
-      image_shape_ = image_shape;
-      if (!preallocated_image_shape_.empty()) {
-        MACE_CHECK(preallocated_image_shape_[0] >= image_shape[0]
-                     && preallocated_image_shape_[1] >= image_shape[1],
-                   "image shape not large enough: preallocated ",
-                   preallocated_image_shape_[0],
-                   " ",
-                   preallocated_image_shape_[1],
-                   "apply for ",
-                   image_shape[0],
-                   " ",
-                   image_shape[1]);
+    if (buffer_ != nullptr) {
+      buffer_->Resize(raw_size());
    } else {
-        buffer_ = std::move(std::unique_ptr<void, std::function<void(void *)>>(
-          alloc_->NewImage(image_shape, dtype_),
-          [this](void *p) { this->alloc_->DeleteImage(p); }));
-        preallocated_image_shape_ = image_shape;
-      }
+      buffer_ = new Buffer(allocator_, raw_size());
+      is_buffer_owner_ = true;
    }
  }

  inline void ResizeLike(const Tensor &other) {
-    if (other.is_image()) {
-      ResizeImage(other.shape(), other.image_shape());
-    } else {
    Resize(other.shape());
  }
-  }

  inline void ResizeLike(const Tensor *other) {
-    if (other->is_image()) {
-      ResizeImage(other->shape(), other->image_shape());
-    } else {
    Resize(other->shape());
  }
+
+  inline void ResizeImage(const std::vector<index_t> &shape,
+                          const std::vector<size_t> &image_shape) {
+    shape_ = shape;
+    if (buffer_ == nullptr) {
+      buffer_ = new Image(image_shape, dtype_);
+      is_buffer_owner_ = true;
+    }
  }

-  inline void PreallocateImage(void *image,
-                               const std::vector<size_t>& image_shape) {
-    is_image_ = true;
-    buffer_ = std::move(std::unique_ptr<void, std::function<void(void *)>>(
-      image, [](void *p) {
-        // tensor does not have ownership of preallocated memory
-      }));
-    preallocated_image_shape_ = image_shape;
+  inline void CopyBytes(const void *src, size_t size) {
+    MappingGuard guard(this);
+    memcpy(buffer_->raw_mutable_data(), src, size);
  }

-  template <typename T>
-  inline void Copy(const T *src, index_t size) {
-    MACE_CHECK(size == size_, "copy src and dst with different size.");
-    CopyBytes(static_cast<const void *>(src), sizeof(T) * size);
+  template<typename T>
+  inline void Copy(const T *src, index_t length) {
+    MACE_CHECK(length == size(), "copy src and dst with different size.");
+    CopyBytes(static_cast<const void *>(src), sizeof(T) * length);
  }

-  template <typename SrcType, typename DstType>
-  inline void CopyWithCast(const SrcType *src, size_t size) {
-    MACE_CHECK(static_cast<index_t>(size) == size_,
-               "copy src and dst with different size.");
-    std::unique_ptr<DstType[]> buffer(new DstType[size]);
-    for (size_t i = 0; i < size; ++i) {
-      buffer[i] = static_cast<DstType>(src[i]);
+  inline void Copy(const Tensor &other) {
+    dtype_ = other.dtype_;
+    ResizeLike(other);
+    MappingGuard map_other(&other);
+    CopyBytes(other.raw_data(), other.size() * SizeOfType());
  }
-    CopyBytes(static_cast<const void *>(buffer.get()), sizeof(DstType) * size);
+
+  inline size_t SizeOfType() const {
+    size_t type_size = 0;
+    CASES(dtype_, type_size = sizeof(T));
+    return type_size;
  }

-  inline void CopyBytes(const void *src, size_t size) {
-    MappingGuard map_this(this);
-    memcpy(raw_mutable_data(), src, size);
+  inline BufferBase *UnderlyingBuffer() const {
+    return buffer_;
  }

  inline void DebugPrint() const {
@@ -281,8 +214,8 @@ class Tensor {
    os.str("");
    os.clear();
    MappingGuard guard(this);
-    for (int i = 0; i < size_; ++i) {
-      if ( i != 0 && i % shape_[3] == 0) {
+    for (int i = 0; i < size(); ++i) {
+      if (i != 0 && i % shape_[3] == 0) {
        os << "\n";
      }
      CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
@@ -291,37 +224,11 @@ class Tensor {
              << dim(2) << ", " << dim(3) << "], content:\n" << os.str();
  }

-  inline size_t SizeOfType() const {
-    size_t type_size = 0;
-    CASES(dtype_, type_size = sizeof(T));
-    return type_size;
-  }
-
-  inline void Copy(const Tensor &other) {
-    alloc_ = other.alloc_;
-    dtype_ = other.dtype_;
-    ResizeLike(other);
-    MappingGuard map_other(&other);
-    if (is_image_) {
-      LOG(FATAL) << "Not support copy image tensor, please use Copy API.";
-    } else {
-      CopyBytes(other.raw_data(), size_ * SizeOfType());
-    }
-  }
-
-  inline void MarkUnused() {
-    this->unused_ = true;
-  }
-
  class MappingGuard {
   public:
    MappingGuard(const Tensor *tensor) : tensor_(tensor) {
      if (tensor_ != nullptr) {
-        if (tensor_->is_image()) {
-          tensor_->MapImage(mapped_image_pitch_);
-        } else {
-          tensor_->Map();
-        }
+        tensor_->buffer_->Map(&mapped_image_pitch_);
      }
    }

@@ -331,10 +238,12 @@ class Tensor {
    }

    ~MappingGuard() {
-      if (tensor_ != nullptr) tensor_->Unmap();
+      if (tensor_ != nullptr) tensor_->buffer_->UnMap();
    }

-    inline const std::vector<size_t> &mapped_image_pitch() const { return mapped_image_pitch_; }
+    inline const std::vector<size_t> &mapped_image_pitch() const {
+      return mapped_image_pitch_;
+    }

   private:
    const Tensor *tensor_;
@@ -344,27 +253,12 @@ class Tensor {
  };

 private:
-  inline void *MappedBuffer() const {
-    if (OnHost()) {
-      return buffer_.get();
-    }
-    return data_;
-  }
-
-  Allocator *alloc_;
-  index_t size_;
+  Allocator *allocator_;
  DataType dtype_;
-  // Raw buffer, must be mapped as host accessable data before
-  // read or write
-  std::unique_ptr<void, std::function<void(void*)>> buffer_;
-  // Mapped buffer
-  mutable void *data_;
  std::vector<index_t> shape_;
-  // Image for opencl
-  bool unused_;
-  bool is_image_;
-  std::vector<size_t> image_shape_;
-  std::vector<size_t> preallocated_image_shape_;
+  BufferBase *buffer_;
+  BufferSlice buffer_slice_;
+  bool is_buffer_owner_;

 DISABLE_COPY_AND_ASSIGN(Tensor);
 };

--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -6,21 +6,11 @@
 #include <vector>

 #include "mace/core/workspace.h"
-#include "mace/core/serializer.h"
 #include "mace/core/arg_helper.h"
-#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"
 #include "mace/utils/timer.h"

 namespace mace {

-std::vector<std::string> Workspace::Tensors() const {
-  std::vector<std::string> names;
-  for (auto &entry : tensor_map_) {
-    names.push_back(entry.first);
-  }
-  return names;
-}
-
 Tensor *Workspace::CreateTensor(const std::string &name,
                                Allocator *alloc,
                                DataType type) {
@@ -28,21 +18,12 @@ Tensor *Workspace::CreateTensor(const std::string &name,
    VLOG(3) << "Tensor " << name << " already exists. Skipping.";
  } else {
    VLOG(3) << "Creating Tensor " << name;
-    tensor_map_[name] = std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
+    tensor_map_[name] =
+      std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
  }
  return GetTensor(name);
 }

-bool Workspace::RemoveTensor(const std::string &name) {
-  auto it = tensor_map_.find(name);
-  if (it != tensor_map_.end()) {
-    VLOG(3) << "Removing blob " << name << " from this workspace.";
-    tensor_map_.erase(it);
-    return true;
-  }
-  return false;
-}
-
 const Tensor *Workspace::GetTensor(const std::string &name) const {
  if (tensor_map_.count(name)) {
    return tensor_map_.at(name).get();
@@ -52,26 +33,51 @@ const Tensor *Workspace::GetTensor(const std::string &name) const {
  return nullptr;
 }

-
-void Workspace::RemoveUnsedTensor() {
-  auto iter = tensor_map_.begin();
-  auto end_iter = tensor_map_.end();
-  while(iter != end_iter) {
-    auto old_iter = iter++;
-    if(old_iter->second->unused()) {
-      tensor_map_.erase(old_iter);
-    }
-  }
-}
-
 Tensor *Workspace::GetTensor(const std::string &name) {
  return const_cast<Tensor *>(
    static_cast<const Workspace *>(this)->GetTensor(name));
 }

+std::vector<std::string> Workspace::Tensors() const {
+  std::vector<std::string> names;
+  for (auto &entry : tensor_map_) {
+    names.push_back(entry.first);
+  }
+  return names;
+}
+
 void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
  MACE_LATENCY_LOGGER(1, "Load model tensors");
-  Serializer serializer;
+  index_t model_data_size = 0;
+  unsigned char *model_data_ptr = nullptr;
+  for (auto &const_tensor : net_def.tensors()) {
+    if (model_data_ptr == nullptr
+      || reinterpret_cast<long long>(const_tensor.data())
+        < reinterpret_cast<long long>(model_data_ptr)) {
+      model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
+    }
+  }
+  for (auto &const_tensor : net_def.tensors()) {
+    model_data_size = std::max(model_data_size,
+                               static_cast<index_t>(
+                                 (reinterpret_cast<long long>(const_tensor.data())
+                                   - reinterpret_cast<long long>(model_data_ptr))
+                                   + const_tensor.data_size()
+                                     * GetEnumTypeSize(const_tensor.data_type())));
+  }
+  VLOG(3) << "Model data size: " << model_data_size;
+
+  if (type == DeviceType::CPU) {
+    tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
+      new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
+  } else {
+    tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
+      new Buffer(GetDeviceAllocator(type), model_data_size)));
+    tensor_buffer_->Map(nullptr);
+    tensor_buffer_->Copy(model_data_ptr, 0, model_data_size);
+    tensor_buffer_->UnMap();
+  }
+
  for (auto &const_tensor : net_def.tensors()) {
    MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
    VLOG(3) << "Tensor name: " << const_tensor.name()
@@ -79,9 +85,24 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
            << ", shape: "
            << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
                                               const_tensor.dims().end()));
-    tensor_map_[const_tensor.name()] =
-        serializer.Deserialize(const_tensor, type);
+    std::vector<index_t> dims;
+    for (const index_t d : const_tensor.dims()) {
+      dims.push_back(d);
    }
+
+    index_t
+      offset = (long long) const_tensor.data() - (long long) model_data_ptr;
+    std::unique_ptr<Tensor> tensor(
+      new Tensor(BufferSlice(tensor_buffer_.get(),
+                             offset,
+                             const_tensor.data_size()
+                               * GetEnumTypeSize(const_tensor.data_type())),
+                 const_tensor.data_type()));
+
+    tensor->Reshape(dims);
+    tensor_map_[const_tensor.name()] = std::move(tensor);
+  }
+
  if (type == DeviceType::OPENCL) {
    CreateImageOutputTensor(net_def);
  }
@@ -91,9 +112,6 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
  if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) {
    return;
  }
-  preallocated_allocator_ =
-    std::move(std::unique_ptr<PreallocatedPooledAllocator>(
-      new OpenCLPreallocatedPooledAllocator));

  DataType dtype = DataType::DT_INVALID;
  // We use the data type of the first op (with mem id, must be image),
@@ -116,18 +134,16 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
  }
  MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
  for (auto &mem_block: net_def.mem_arena().mem_block()) {
-    preallocated_allocator_->PreallocateImage(mem_block.mem_id(),
-                                              {mem_block.x(), mem_block.y()},
-                                              dtype);
+    std::unique_ptr<BufferBase>
+      image_buf(new Image({mem_block.x(), mem_block.y()}, dtype));
+    preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
  }
  VLOG(3) << "Preallocate image to tensors";
-  auto allocator = GetDeviceAllocator(DeviceType::OPENCL);
  for (auto &op: net_def.op()) {
    if (op.has_mem_id()) {
-      CreateTensor(op.output(0), allocator, dtype);
-      tensor_map_[op.output(0)]->PreallocateImage(
-        preallocated_allocator_->GetImage(op.mem_id()),
-        preallocated_allocator_->GetImageSize(op.mem_id()));
+      std::unique_ptr<Tensor> tensor
+        (new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
+      tensor_map_[op.output(0)] = std::move(tensor);
    }
  }
 }

--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -15,26 +15,23 @@ class Workspace {
 public:
  typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;

-  Workspace()
-    : preallocated_allocator_(nullptr) {}
+  Workspace() {}
  ~Workspace() {}

-  std::vector<std::string> Tensors() const;
-
-  Tensor *CreateTensor(const std::string &name, Allocator *alloc, DataType type);
-
-  bool RemoveTensor(const std::string &name);
-
-  void RemoveUnsedTensor();
+  Tensor *CreateTensor(const std::string &name,
+                       Allocator *alloc,
+                       DataType type);

  inline bool HasTensor(const std::string &name) const {
-    return tensor_map_.count(name);
+    return tensor_map_.find(name) != tensor_map_.end();
  }

  const Tensor *GetTensor(const std::string &name) const;

  Tensor *GetTensor(const std::string &name);

+  std::vector<std::string> Tensors() const;
+
  void LoadModelTensor(const NetDef &net_def, DeviceType type);

 private:
@@ -42,7 +39,9 @@ class Workspace {

  TensorMap tensor_map_;

-  std::unique_ptr<PreallocatedPooledAllocator> preallocated_allocator_;
+  std::unique_ptr<BufferBase> tensor_buffer_;
+
+  PreallocatedPooledAllocator preallocated_allocator_;

 DISABLE_COPY_AND_ASSIGN(Workspace);
 };

--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -127,7 +127,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
        }
      }
    }
-    DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
+    DoActivation(output_ptr, output_ptr, output->size(), activation_,
                 relux_max_limit_);
  }
 };

--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -616,7 +616,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
        }
      }
    }
-    DoActivation(output_data, output_data, output->NumElements(), activation_,
+    DoActivation(output_data, output_data, output->size(), activation_,
                 relux_max_limit_);
  }
 };

--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -402,7 +402,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
        valid_h_stop, valid_w_start, valid_w_stop);

    output_ptr = output->mutable_data<T>();
-    DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
+    DoActivation(output_ptr, output_ptr, output->size(), activation_,
                 relux_max_limit_);
  }
 };

--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -65,7 +65,7 @@ struct FullyConnectedFunctor : FullyConnectedBase {
      }
    }

-    DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
+    DoActivation(output_ptr, output_ptr, output->size(), activation_,
                 relux_max_limit_);
  }
 };

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -14,7 +14,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
                                                             const BufferType type,
                                                             Tensor *image,
                                                             StatsFuture *future) {
-  MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type";
  std::vector<size_t> image_shape;
  if (!i2b_) {
    CalImage2DShape(buffer->shape(), type, image_shape);
@@ -25,9 +24,9 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
    } else {
      image->ResizeImage(buffer->shape(), image_shape);
    }
-    buffer->MarkUnused();
  } else {
-    image_shape = image->image_shape();
+    Image *image_buf = dynamic_cast<Image*>(image->UnderlyingBuffer());
+    image_shape = image_buf->image_shape();
    buffer->Resize(image->shape());
  }

@@ -78,7 +77,11 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
                                         built_options);

  uint32_t idx = 0;
-  b2f_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(buffer->buffer())));
+  b2f_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(buffer->buffer())));
+  if (!i2b_) {
+    MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned");
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype())));
+  }
  if (type == ARGUMENT) {
    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
  } else if(type == WEIGHT_HEIGHT) {

--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
 #include <common.h>

 __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
+                                     __private const int input_offset,
                                     __private const int filter_w,
                                     __private const int out_channel,
                                     __private const int in_channel,
@@ -13,7 +14,7 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
  const int in_channel_idx = w % rounded_in_channel;
  const int h_idx = hw_idx / filter_w;
  const int w_idx = hw_idx % filter_w;
-  const int offset = ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
+  const int offset = input_offset + ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
                           + in_channel_idx;

  VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
@@ -79,6 +80,7 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
 }

 __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
+                                        __private const int input_offset,
                                        __private const int filter_w,
                                        __private const int in_channel,
                                        __private const int multiplier,
@@ -92,7 +94,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
    const int h_idx = w / filter_w;
    const int w_idx = w % filter_w;

-    const int offset = mad24(mad24(h_idx, filter_w, w_idx),
+    const int offset = input_offset + mad24(mad24(h_idx, filter_w, w_idx),
                             in_channel, in_channel_idx);

    const int size = in_channel - in_channel_idx;
@@ -117,7 +119,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
    const int h_idx = hw_idx / filter_w;
    const int w_idx = hw_idx % filter_w;

-    const int offset = mad24(mad24(mad24(h_idx, filter_w, w_idx),
+    const int offset = input_offset + mad24(mad24(mad24(h_idx, filter_w, w_idx),
                in_channel, in_channel_idx),
            multiplier, m);
    // TODO support multiplier > 1
@@ -128,6 +130,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
 }

 __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+                                     __private const int input_offset,
                                     __private const int height,
                                     __private const int width,
                                     __private const int channels,
@@ -138,7 +141,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
  const int height_idx = h % height;
  const int width_idx = w % width;
  const int channel_idx = w / width * 4;
-  const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+  const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
                           + channel_idx;

  const int size = channels - channel_idx;
@@ -191,13 +194,16 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
 }

 __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+                                  __private const int input_offset,
                                  __private const int count,
                                  __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
-  const int offset = w * 4;

-  const int size = count - offset;
+  const int offset = input_offset + w * 4;
+  const int size = count - w * 4;
+
+
  VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
  if (size < 4) {
    switch(size) {
@@ -241,6 +247,7 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */


 __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
+                                            __private const int input_offset,
                                            __private const int height,
                                            __private const int width,
                                            __private const int channels,
@@ -253,7 +260,7 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
  const int height_idx = (h % height_blks) << 2;
  const int width_idx = w % width;
  const int channel_idx = w / width;
-  int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+  int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
      + channel_idx;

  int size = height - height_idx;
@@ -304,6 +311,7 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc


 __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+                                           __private const int input_offset,
                                           __private const int height,
                                           __private const int width,
                                           __private const int channels,
@@ -314,7 +322,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
  const int height_idx = h % height;
  const int width_idx = (w % width) << 2;
  const int channel_idx = w / width;
-  const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+  const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
      + channel_idx;

  int size = width - width_idx;
@@ -336,6 +344,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n

 // only support 3x3 now
 __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
+                                              __private const int input_offset,
                                              __private const int in_channels,
                                              __private const int height,
                                              __private const int width,
@@ -345,7 +354,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
  const int out_channels = get_global_size(1);
  const int out_channel_idx = h;
  const int in_channel_idx = w << 2;
-  const int offset = (out_channel_idx * in_channels + in_channel_idx) * height * width;
+  const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
  const int length = min((in_channels - in_channel_idx) * 9, 36);
  DATA_TYPE in[36] = {0};
  DATA_TYPE4 tt;

--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -65,13 +65,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,

  index_t kernel_h = filter->dim(0);
  index_t kernel_w = filter->dim(1);
-  if (!input->is_image() || strides_[0] != strides_[1] ||
+  if (strides_[0] != strides_[1] ||
      (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
                 << ",dilations " << dilations_[0] << "x" << dilations_[1]
-                 << " and input image: " << input->is_image()
                 << " is not implemented yet.";
    MACE_NOT_IMPLEMENTED;
  }

--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -27,13 +27,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
  if (kernel_.get() == nullptr) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
-    if (input->is_image()) {
+
    std::vector<size_t> output_image_shape;
-      CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+    CalImage2DShape(output_shape,
+                    BufferType::IN_OUT_CHANNEL,
+                    output_image_shape);
    output->ResizeImage(output_shape, output_image_shape);
-    } else {
-      output->Resize(output_shape);
-    }

    float height_scale =
        CalculateResizeScale(in_height, out_height, align_corners_);