Separate physical memory pool from logical tensor

ed7a7f53 · 李寅 · 40d6571f · ed7a7f53 · ed7a7f53 · ed7a7f53
6 changed files
--- a/mace/core/preallocated_pooled_allocator.h
+++ b/mace/core/preallocated_pooled_allocator.h
+//
+// Copyright (c) 2018 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
+#define MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
+
+#include "mace/core/allocator.h"
+
+namespace mace {
+
+// Interface for a pool of images that are allocated up front and afterwards
+// looked up by an integer memory id.
+class PreallocatedPooledAllocator {
+ public:
+  PreallocatedPooledAllocator() = default;
+
+  virtual ~PreallocatedPooledAllocator() noexcept = default;
+
+  // Allocates an image of `image_shape` whose elements are `data_type` and
+  // registers it under `mem_id`.
+  virtual void PreallocateImage(int mem_id,
+                                const std::vector<size_t> &image_shape,
+                                DataType data_type) = 0;
+
+  // Returns the image registered under `mem_id`.
+  virtual void *GetImage(int mem_id) = 0;
+
+  // Returns whether an image is registered under `mem_id`.
+  virtual bool HasImage(int mem_id) = 0;
+
+  // Returns the shape recorded for the image registered under `mem_id`.
+  virtual std::vector<size_t> GetImageSize(int mem_id) = 0;
+};
+
+} // namespace mace
+
+#endif // MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
--- a/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.cc
+//
+// Copyright (c) 2018 XiaoMi All rights reserved.
+//
+
+#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"
+
+namespace mace {
+
+// Binds the pool to the device allocator returned for DeviceType::OPENCL.
+OpenCLPreallocatedPooledAllocator::OpenCLPreallocatedPooledAllocator()
+    : allocator(GetDeviceAllocator(DeviceType::OPENCL)) {}
+
+// No manual cleanup is performed here; members are destroyed implicitly.
+OpenCLPreallocatedPooledAllocator::~OpenCLPreallocatedPooledAllocator() =
+    default;
+
+// Allocates an OpenCL image of `image_shape` with element type `data_type`
+// and registers it, together with its shape, under `mem_id`.
+//
+// `mem_id` must not already be registered and `image_shape` must have exactly
+// two entries; both conditions are enforced with MACE_CHECK.
+void OpenCLPreallocatedPooledAllocator::PreallocateImage(
+    int mem_id,
+    const std::vector<size_t> &image_shape,
+    DataType data_type) {
+  MACE_CHECK(!this->HasImage(mem_id), "Memory already exists: ", mem_id);
+  // Both entries are indexed below, so reject malformed shapes up front
+  // instead of reading out of bounds.
+  MACE_CHECK(image_shape.size() == 2,
+             "Image shape must have 2 dimensions, got ", image_shape.size());
+  VLOG(3) << "Preallocate OpenCL image: " << mem_id << " "
+          << image_shape[0] << ", " << image_shape[1];
+  // Capture the allocator pointer by value so the stored deleter does not
+  // depend on `this`; the image is handed back to the allocator that made it.
+  Allocator *image_allocator = allocator;
+  images_[mem_id] = std::unique_ptr<void, std::function<void(void *)>>(
+    image_allocator->NewImage(image_shape, data_type),
+    [image_allocator](void *p) { image_allocator->DeleteImage(p); });
+  image_shapes_[mem_id] = image_shape;
+}
+
+} // namespace mace
\ No newline at end of file
--- a/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h
+++ b/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h
+//
+// Copyright (c) 2018 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
+#define MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
+
+#include "mace/core/preallocated_pooled_allocator.h"
+#include <unordered_map>
+
+namespace mace {
+
+// PreallocatedPooledAllocator that keeps preallocated images and their shapes
+// in maps keyed by memory id.
+class OpenCLPreallocatedPooledAllocator : public PreallocatedPooledAllocator {
+ public:
+  OpenCLPreallocatedPooledAllocator();
+
+  ~OpenCLPreallocatedPooledAllocator() override;
+
+  // Allocates an image and registers it under `mem_id`; defined out of line.
+  void PreallocateImage(int mem_id,
+                        const std::vector<size_t> &image_shape,
+                        DataType data_type) override;
+
+  // Returns the image registered under `mem_id`. The id must be registered;
+  // this is enforced with MACE_CHECK.
+  inline void *GetImage(int mem_id) override {
+    auto it = images_.find(mem_id);
+    MACE_CHECK(it != images_.end(), "image does not exist");
+    return it->second.get();
+  }
+
+  // Returns whether an image is registered under `mem_id`.
+  inline bool HasImage(int mem_id) override {
+    return images_.find(mem_id) != images_.end();
+  }
+
+  // Returns a copy of the shape recorded for `mem_id`. The id must be
+  // registered; a lookup with find() avoids operator[] silently inserting and
+  // returning an empty shape for an unknown id.
+  inline std::vector<size_t> GetImageSize(int mem_id) override {
+    auto it = image_shapes_.find(mem_id);
+    MACE_CHECK(it != image_shapes_.end(), "image does not exist: ", mem_id);
+    return it->second;
+  }
+
+ private:
+  // Image handles keyed by memory id; each entry carries its own deleter.
+  std::unordered_map<int, std::unique_ptr<void, std::function<void(void *)>>>
+    images_;
+  // Shape each image was allocated with, keyed by memory id.
+  std::unordered_map<int, std::vector<size_t>> image_shapes_;
+  // Allocator used to create and delete the images; set in the constructor.
+  Allocator *allocator;
+};
+
+} // namespace mace
+
+#endif // MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -10,6 +10,7 @@
 #include "mace/utils/logging.h"
 #include "mace/core/types.h"
 #include "mace/core/public/mace.h"
+#include "preallocated_pooled_allocator.h"

 namespace mace {

@@ -71,7 +72,8 @@ class Tensor {
        buffer_(nullptr),
        data_(nullptr),
        unused_(false),
-        is_image_(false){};
+        is_image_(false){
+};

  Tensor(Allocator *alloc, DataType type)
      : alloc_(alloc),
@@ -80,18 +82,10 @@ class Tensor {
        buffer_(nullptr),
        data_(nullptr),
        unused_(false),
-        is_image_(false){};
+        is_image_(false){
+  };

  ~Tensor() {
-    MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before destroy");
-    if (buffer_ != nullptr) {
-      MACE_CHECK_NOTNULL(alloc_);
-      if (is_image_) {
-        alloc_->DeleteImage(buffer_);
-      } else {
-        alloc_->Delete(buffer_);
-      }
-    }
  }

  inline DataType dtype() const { return dtype_; }
@@ -132,13 +126,13 @@ class Tensor {
  inline void Map() const {
    if (!OnHost()) {
      MACE_CHECK(buffer_ != nullptr && data_ == nullptr);
-      data_ = alloc_->Map(buffer_, size_ * SizeOfType());
+      data_ = alloc_->Map(buffer_.get(), size_ * SizeOfType());
    }
  }

  inline void MapImage(std::vector<size_t> &mapped_image_pitch) const {
    MACE_CHECK(!OnHost() && buffer_ != nullptr && data_ == nullptr);
-    data_ = alloc_->MapImage(buffer_, image_shape_, mapped_image_pitch);
+    data_ = alloc_->MapImage(buffer_.get(), image_shape_, mapped_image_pitch);
  }

  /*
@@ -147,12 +141,12 @@ class Tensor {
  inline void Unmap() const {
    if (!OnHost()) {
      MACE_CHECK(buffer_ != nullptr && data_ != nullptr);
-      alloc_->Unmap(buffer_, data_);
+      alloc_->Unmap(buffer_.get(), data_);
      data_ = nullptr;
    }
  }

-  void *buffer() const { return buffer_; }
+  void *buffer() const { return buffer_.get(); }

  inline const void *raw_data() const {
    void *data = MappedBuffer();
@@ -181,42 +175,51 @@ class Tensor {
  }

  inline void Resize(const vector<index_t> &shape) {
+    MACE_CHECK(!is_image_ || buffer_ == nullptr,
+               "Resize is not for image, use ResizeImage instead.");
+    is_image_ = false;
    shape_ = shape;
    index_t size = NumElements();
-    if (size_ != size || is_image_) {
+    if (size_ != size) {
      size_ = size;
      MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
-      if (is_image_) {
-        alloc_->DeleteImage(buffer_);
-      } else {
-        alloc_->Delete(buffer_);
-      }
-      is_image_ = false;
-      CASES(dtype_, buffer_ = alloc_->New(size_ * sizeof(T)));
+      CASES(dtype_,
+            (buffer_ =
+               std::move(std::unique_ptr<void, std::function<void(void *)>>(
+                 alloc_->New(size_ * sizeof(T)),
+                 [this](void *p) {
+                   this->alloc_->Delete(p);
+                 })
+               )));
    }
  }

  inline void ResizeImage(const vector<index_t> &shape,
                          const std::vector<size_t> &image_shape) {
+    MACE_CHECK(is_image_ || buffer_ == nullptr,
+               "ResizeImage is not for buffer, use Resize instead.");
+    is_image_ = true;
    shape_ = shape;
    index_t size = NumElements();
-    if (size_ != size || !is_image_) {
+    if (size_ != size) {
      size_ = size;
-      MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
-
-      if (is_image_ && !image_shape_.empty()) {
-        MACE_ASSERT(image_shape_.size() == 2
-                        && image_shape_[0] >= image_shape[0]
-                        || image_shape_[1] >= image_shape[1],
-                    "image shape not large enough");
-      }
-      if (!is_image_ && buffer_ != nullptr) {
-        alloc_->Delete(buffer_);
-      }
-      is_image_ = true;
-      if (image_shape_.empty()) {
      image_shape_ = image_shape;
-        buffer_ = alloc_->NewImage(image_shape, dtype_);
+      if (!preallocated_image_shape_.empty()) {
+        MACE_CHECK(preallocated_image_shape_[0] >= image_shape[0]
+                     && preallocated_image_shape_[1] >= image_shape[1],
+                   "image shape not large enough: preallocated ",
+                   preallocated_image_shape_[0],
+                   " ",
+                   preallocated_image_shape_[1],
+                   "apply for ",
+                   image_shape[0],
+                   " ",
+                   image_shape[1]);
+      } else {
+        buffer_ = std::move(std::unique_ptr<void, std::function<void(void *)>>(
+          alloc_->NewImage(image_shape, dtype_),
+          [this](void *p) { this->alloc_->DeleteImage(p); }));
+        preallocated_image_shape_ = image_shape;
      }
    }
  }
@@ -237,15 +240,14 @@ class Tensor {
    }
  }

-  inline void AllocateImageMemory(const std::vector<size_t> &image_shape) {
+  inline void PreallocateImage(void *image,
+                               const std::vector<size_t>& image_shape) {
    is_image_ = true;
-    if (image_shape_ != image_shape) {
-      if (buffer_ != nullptr) {
-        alloc_->DeleteImage(buffer_);
-      }
-      image_shape_ = image_shape;
-      buffer_ = alloc_->NewImage(image_shape, dtype_);
-    }
+    buffer_ = std::move(std::unique_ptr<void, std::function<void(void *)>>(
+      image, [](void *p) {
+        // tensor does not have ownership of preallocated memory
+      }));
+    preallocated_image_shape_ = image_shape;
  }

  template <typename T>
@@ -273,7 +275,7 @@ class Tensor {
  inline void DebugPrint() const {
    using namespace numerical_chars;
    std::stringstream os;
-    for (int i : shape_) {
+    for (index_t i : shape_) {
      os << i << ", ";
    }

@@ -336,7 +338,7 @@ class Tensor {
 private:
  inline void *MappedBuffer() const {
    if (OnHost()) {
-      return buffer_;
+      return buffer_.get();
    }
    return data_;
  }
@@ -346,7 +348,7 @@ class Tensor {
  DataType dtype_;
  // Raw buffer, must be mapped as host accessable data before
  // read or write
-  void *buffer_;
+  std::unique_ptr<void, std::function<void(void*)>> buffer_;
  // Mapped buffer
  mutable void *data_;
  vector<index_t> shape_;
@@ -354,6 +356,7 @@ class Tensor {
  bool unused_;
  bool is_image_;
  std::vector<size_t> image_shape_;
+  std::vector<size_t> preallocated_image_shape_;

  DISABLE_COPY_AND_ASSIGN(Tensor);
 };

--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -5,6 +5,7 @@
 #include "mace/core/workspace.h"
 #include "mace/core/serializer.h"
 #include "mace/core/arg_helper.h"
+#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"

 namespace mace {

@@ -23,7 +24,7 @@ Tensor *Workspace::CreateTensor(const string &name,
    VLOG(1) << "Tensor " << name << " already exists. Skipping.";
  } else {
    VLOG(1) << "Creating Tensor " << name;
-    tensor_map_[name] = unique_ptr<Tensor>(new Tensor(alloc, type));
+    tensor_map_[name] = std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
  }
  return GetTensor(name);
 }
@@ -84,23 +85,43 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
  if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) {
    return;
  }
-  std::map<std::string, std::shared_ptr<Tensor>> mem_tensor_map;
-  const DataType dtype = static_cast<DataType>(
+  preallocated_allocator_ =
+    std::move(std::unique_ptr<PreallocatedPooledAllocator>(
+      new OpenCLPreallocatedPooledAllocator));
+
+  DataType dtype = DataType::DT_INVALID;
+  // We use the data type of the first op (with mem id, must be image),
+  // as GPU have consistent data type for each layer for now.
+  // As DSP may have different data output type for each op,
+  // we stick to the same concept.
+  for (auto &op: net_def.op()) {
+    if (op.has_mem_id()) {
+      const DataType op_dtype = static_cast<DataType>(
        ArgumentHelper::GetSingleArgument<OperatorDef, int>(
-          net_def.op(0),
+          op,
          "T",
          static_cast<int>(DT_FLOAT)));
+      if (op_dtype != DataType::DT_INVALID) {
+        dtype = op_dtype;
+        // find first valid data type, break
+        break;
+      }
+    }
+  }
+  MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
  for (auto &mem_block: net_def.mem_arena().mem_block()) {
-    string mem_block_name = MemBlockName(mem_block.mem_id());
-    mem_tensor_map[mem_block_name].reset(new Tensor(
-        GetDeviceAllocator(DeviceType::OPENCL),
-        dtype));
-    mem_tensor_map[mem_block_name]->AllocateImageMemory({mem_block.x(),
-                                                         mem_block.y()});
+    preallocated_allocator_->PreallocateImage(mem_block.mem_id(),
+                                              {mem_block.x(), mem_block.y()},
+                                              dtype);
  }
+  VLOG(1) << "Preallocate image to tensors";
+  auto allocator = GetDeviceAllocator(DeviceType::OPENCL);
  for (auto &op: net_def.op()) {
    if (op.has_mem_id()) {
-      tensor_map_[op.output(0)] = mem_tensor_map[MemBlockName(op.mem_id())];
+      CreateTensor(op.output(0), allocator, dtype);
+      tensor_map_[op.output(0)]->PreallocateImage(
+        preallocated_allocator_->GetImage(op.mem_id()),
+        preallocated_allocator_->GetImageSize(op.mem_id()));
    }
  }
 }

--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -8,14 +8,17 @@
 #include "mace/core/common.h"
 #include "mace/core/tensor.h"
 #include "mace/core/public/mace.h"
+#include "mace/core/preallocated_pooled_allocator.h"

 namespace mace {

 class Workspace {
 public:
-  typedef map<string, std::shared_ptr<Tensor>> TensorMap;
+  typedef map<string, std::unique_ptr<Tensor>> TensorMap;

-  Workspace() {}
+  Workspace()
+    : preallocated_allocator_(nullptr) {}
+  ~Workspace() {}

  vector<string> Tensors() const;

@@ -35,15 +38,13 @@ class Workspace {

  void LoadModelTensor(const NetDef &net_def, DeviceType type);

-  inline std::string MemBlockName(int mem_id) const {
-	  return internal::MakeString("mem_block_", mem_id);
-  };
-
 private:
  void CreateImageOutputTensor(const NetDef &net_def);

  TensorMap tensor_map_;

+  std::unique_ptr<PreallocatedPooledAllocator> preallocated_allocator_;
+
  DISABLE_COPY_AND_ASSIGN(Workspace);
 };