From f763da2b859200b5e6a3d9238cadfc6ea652db83 Mon Sep 17 00:00:00 2001 From: liuqi Date: Tue, 17 Jul 2018 12:43:21 +0800 Subject: [PATCH] Remove unused tensor of GPU. --- mace/core/runtime/opencl/opencl_runtime.cc | 6 + mace/core/runtime/opencl/opencl_runtime.h | 1 + mace/core/runtime_failure_mock.cc | 8 +- mace/core/tensor.h | 10 ++ mace/core/workspace.cc | 121 +++++++++++++++------ mace/core/workspace.h | 2 + mace/kernels/opencl/buffer_to_image.cc | 3 + mace/libmace/mace.cc | 5 +- mace/public/mace_runtime.h | 6 +- 9 files changed, 125 insertions(+), 37 deletions(-) diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 10d3088c..3e67ef52 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -675,6 +675,12 @@ uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() { return size; } +uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() { + uint64_t size = 0; + device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size); + return size; +} + uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) { uint64_t size = 0; kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size); diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 31d29328..931df6c4 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -85,6 +85,7 @@ class OpenCLRuntime { void GetCallStats(const cl::Event &event, CallStats *stats); uint64_t GetDeviceMaxWorkGroupSize(); + uint64_t GetDeviceMaxMemAllocSize(); uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel); uint64_t GetKernelWaveSize(const cl::Kernel &kernel); bool IsNonUniformWorkgroupsSupported() const; diff --git a/mace/core/runtime_failure_mock.cc b/mace/core/runtime_failure_mock.cc index b28f51fe..3b47855e 100644 --- a/mace/core/runtime_failure_mock.cc +++ b/mace/core/runtime_failure_mock.cc @@ -12,7 +12,7 @@ // See 
the License for the specific language governing permissions and // limitations under the License. -#include <ctime> +#include <random> #include <cstdlib> #include "mace/core/runtime_failure_mock.h" @@ -35,10 +35,12 @@ inline float GetRuntimeFailureRatioFromEnv() { } // namespace bool ShouldMockRuntimeFailure() { - static unsigned int seed = time(NULL); static float mock_runtime_failure_ratio = GetRuntimeFailureRatioFromEnv(); if (mock_runtime_failure_ratio > 1e-6) { - float random_ratio = rand_r(&seed) / static_cast<float>(RAND_MAX); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dis(0.0, 1.0); + float random_ratio = dis(gen); if (random_ratio < mock_runtime_failure_ratio) { VLOG(0) << "Mock runtime failure."; return true; diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 6dd41a4a..a40e55c6 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -105,18 +105,21 @@ class Tensor { dtype_(type), buffer_(nullptr), is_buffer_owner_(true), + unused_(false), name_("") {} Tensor(BufferBase *buffer, DataType dtype) : dtype_(dtype), buffer_(buffer), is_buffer_owner_(false), + unused_(false), name_("") {} Tensor(const BufferSlice &buffer_slice, DataType dtype) : dtype_(dtype), buffer_slice_(buffer_slice), is_buffer_owner_(false), + unused_(false), name_("") { buffer_ = &buffer_slice_; } @@ -133,6 +136,8 @@ class Tensor { inline void SetDtype(DataType dtype) { dtype_ = dtype; } + inline bool unused() const { return unused_; } + inline const std::vector<index_t> &shape() const { return shape_; } inline index_t dim_size() const { return shape_.size(); } @@ -195,6 +200,10 @@ class Tensor { return static_cast<T *>(buffer_->raw_mutable_data()); } + inline void MarkUnused() { + unused_ = true; + } + inline void Clear() { MACE_CHECK_NOTNULL(buffer_); buffer_->Clear(raw_size()); @@ -362,6 +371,7 @@ class Tensor { BufferBase *buffer_; BufferSlice buffer_slice_; bool is_buffer_owner_; + bool unused_; std::string name_; MACE_DISABLE_COPY_AND_ASSIGN(Tensor); diff --git
a/mace/core/workspace.cc b/mace/core/workspace.cc index 3d03345d..cb6be4ec 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -18,6 +18,9 @@ #include <vector> #include "mace/core/arg_helper.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_runtime.h" +#endif #include "mace/core/workspace.h" #include "mace/utils/timer.h" @@ -85,41 +88,82 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, VLOG(3) << "Model data size: " << model_data_size; if (model_data_size > 0) { - if (type == DeviceType::CPU) { - tensor_buffer_ = std::unique_ptr<BufferBase>( - new Buffer(GetDeviceAllocator(type), - const_cast<unsigned char *>(model_data), - model_data_size)); - } else { - tensor_buffer_ = std::unique_ptr<BufferBase>( - new Buffer(GetDeviceAllocator(type))); - MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size)); - tensor_buffer_->Map(nullptr); - tensor_buffer_->Copy(const_cast<unsigned char *>(model_data), - 0, model_data_size); - tensor_buffer_->UnMap(); - } - } +#ifdef MACE_ENABLE_OPENCL + if (type == DeviceType::GPU && + OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <= + static_cast<uint64_t>(model_data_size)) { + for (auto &const_tensor : net_def.tensors()) { + MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); + VLOG(3) << "Tensor name: " << const_tensor.name() + << ", data type: " << const_tensor.data_type() << ", shape: " + << MakeString(std::vector<int>(const_tensor.dims().begin(), + const_tensor.dims().end())); + std::vector<index_t> dims; + for (const index_t d : const_tensor.dims()) { + dims.push_back(d); + } - for (auto &const_tensor : net_def.tensors()) { - MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); - VLOG(3) << "Tensor name: " << const_tensor.name() - << ", data type: " << const_tensor.data_type() << ", shape: " - << MakeString(std::vector<int>(const_tensor.dims().begin(), - const_tensor.dims().end())); - std::vector<index_t> dims; - for (const index_t d : const_tensor.dims()) { - dims.push_back(d); - } + std::unique_ptr<Tensor> tensor( + new Tensor(GetDeviceAllocator(type), const_tensor.data_type())); + tensor->Resize(dims); - std::unique_ptr<Tensor> tensor( - new Tensor(BufferSlice(tensor_buffer_.get(), const_tensor.offset(), - const_tensor.data_size() * - GetEnumTypeSize(const_tensor.data_type())), - const_tensor.data_type())); + MACE_CHECK(tensor->size() == const_tensor.data_size(), + "Tensor's data_size not equal with the shape"); + MACE_CHECK(const_tensor.offset() + tensor->raw_size() <= + model_data_size, + "buffer offset + length (", + const_tensor.offset(), + " + ", + tensor->raw_size(), + ") should <= ", + model_data_size); + tensor->CopyBytes(model_data + const_tensor.offset(), + const_tensor.data_size() * + GetEnumTypeSize(const_tensor.data_type())); - tensor->Reshape(dims); - tensor_map_[const_tensor.name()] = std::move(tensor); + tensor_map_[const_tensor.name()] = std::move(tensor); + } + } else { +#else + { +#endif + if (type == DeviceType::CPU) { + tensor_buffer_ = std::unique_ptr<BufferBase>( + new Buffer(GetDeviceAllocator(type), + const_cast<unsigned char *>(model_data), + model_data_size)); + } else { + tensor_buffer_ = std::unique_ptr<BufferBase>( + new Buffer(GetDeviceAllocator(type))); + MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size)); + tensor_buffer_->Map(nullptr); + tensor_buffer_->Copy(const_cast<unsigned char *>(model_data), + 0, model_data_size); + tensor_buffer_->UnMap(); + } + for (auto &const_tensor : net_def.tensors()) { + MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); + VLOG(3) << "Tensor name: " << const_tensor.name() + << ", data type: " << const_tensor.data_type() << ", shape: " + << MakeString(std::vector<int>(const_tensor.dims().begin(), + const_tensor.dims().end())); + std::vector<index_t> dims; + for (const index_t d : const_tensor.dims()) { + dims.push_back(d); + } + + std::unique_ptr<Tensor> tensor( + new Tensor(BufferSlice( + tensor_buffer_.get(), const_tensor.offset(), + const_tensor.data_size() * + GetEnumTypeSize(const_tensor.data_type())), + const_tensor.data_type())); + + tensor->Reshape(dims); + tensor_map_[const_tensor.name()] = std::move(tensor); + } + } } if (type == DeviceType::CPU || type == DeviceType::GPU) { @@ -232,4 +276,17 @@ ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) { } } +void Workspace::RemoveUnusedBuffer() { + auto iter = tensor_map_.begin(); + auto end_iter = tensor_map_.end(); + while (iter != end_iter) { + auto old_iter = iter++; + if (old_iter->second->unused()) { + tensor_map_.erase(old_iter); + } + } + + tensor_buffer_.reset(nullptr); +} + } // namespace mace diff --git a/mace/core/workspace.h b/mace/core/workspace.h index 38e8777b..ec636128 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -53,6 +53,8 @@ class Workspace { ScratchBuffer *GetScratchBuffer(DeviceType device_type); + void RemoveUnusedBuffer(); + private: MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, DeviceType device_type); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 5efad285..b29f7e81 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -180,6 +180,9 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( }; } + // Mark the buffer unused. 
+ const_cast<Tensor *>(buffer)->MarkUnused(); + return MACE_SUCCESS; } diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 93518f85..03731078 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -222,6 +222,9 @@ MaceStatus MaceEngine::Impl::Init( #ifdef MACE_ENABLE_HEXAGON } #endif + if (device_type_ == DeviceType::GPU) { + ws_->RemoveUnusedBuffer(); + } return MaceStatus::MACE_SUCCESS; } @@ -240,7 +243,7 @@ MaceStatus MaceEngine::Impl::Init( } model_data_ = LoadModelData(model_data_file, model_data_size_); - Init(net_def, input_nodes, output_nodes, model_data_); + MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, model_data_)); if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON) { UnloadModelData(model_data_, model_data_size_); diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h index 807155bb..a6a628b6 100644 --- a/mace/public/mace_runtime.h +++ b/mace/public/mace_runtime.h @@ -66,6 +66,7 @@ class KVStorageFactory { class __attribute__((visibility("default"))) FileStorageFactory : public KVStorageFactory { public: + // You have to make sure your APP has read and write permissions for the path. explicit FileStorageFactory(const std::string &path); ~FileStorageFactory(); @@ -77,7 +78,10 @@ class __attribute__((visibility("default"))) FileStorageFactory std::unique_ptr<Impl> impl_; }; -// Set KV store factory used as OpenCL cache. (Call Once) +// Set Key-Value store factory. (Call Once) +// Now KVStorage is used to store the built OpenCL binaries to file, +// which could speed up the GPU initialization and first run. +// If you do not call this API, GPU initialization may be slow. __attribute__((visibility("default"))) void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory); -- GitLab