Remove unused GPU tensors.

f763da2b · liuqi · 691331bd · f763da2b · f763da2b · f763da2b
9 changed files
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -675,6 +675,12 @@ uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
  return size;
 }

+uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() {
+  uint64_t max_alloc_size = 0;  // Returned as-is if the query writes nothing.
+  device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &max_alloc_size);
+  return max_alloc_size;
+}
+
 uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
  uint64_t size = 0;
  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);

--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -85,6 +85,7 @@ class OpenCLRuntime {

  void GetCallStats(const cl::Event &event, CallStats *stats);
  uint64_t GetDeviceMaxWorkGroupSize();
+  uint64_t GetDeviceMaxMemAllocSize();
  uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
  uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
  bool IsNonUniformWorkgroupsSupported() const;

--- a/mace/core/runtime_failure_mock.cc
+++ b/mace/core/runtime_failure_mock.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <cstdlib>
+#include <random>
 #include <string>

 #include "mace/core/runtime_failure_mock.h"
@@ -35,10 +35,12 @@ inline float GetRuntimeFailureRatioFromEnv() {
 }  // namespace

 bool ShouldMockRuntimeFailure() {
-  static unsigned int seed = time(NULL);
  static float mock_runtime_failure_ratio = GetRuntimeFailureRatioFromEnv();
  if (mock_runtime_failure_ratio > 1e-6) {
-    float random_ratio = rand_r(&seed) / static_cast<float>(RAND_MAX);
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dis(0.0, 1.0);
+    float random_ratio = dis(gen);
    if (random_ratio < mock_runtime_failure_ratio) {
      VLOG(0) << "Mock runtime failure.";
      return true;

--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -105,18 +105,21 @@ class Tensor {
        dtype_(type),
        buffer_(nullptr),
        is_buffer_owner_(true),
+        unused_(false),
        name_("") {}

  Tensor(BufferBase *buffer, DataType dtype)
    : dtype_(dtype),
      buffer_(buffer),
      is_buffer_owner_(false),
+      unused_(false),
      name_("") {}

  Tensor(const BufferSlice &buffer_slice, DataType dtype)
      : dtype_(dtype),
        buffer_slice_(buffer_slice),
        is_buffer_owner_(false),
+        unused_(false),
        name_("") {
    buffer_ = &buffer_slice_;
  }
@@ -133,6 +136,8 @@ class Tensor {

  inline void SetDtype(DataType dtype) { dtype_ = dtype; }

+  inline bool unused() const { return unused_; }  // True after MarkUnused().
+
  inline const std::vector<index_t> &shape() const { return shape_; }

  inline index_t dim_size() const { return shape_.size(); }
@@ -195,6 +200,10 @@ class Tensor {
    return static_cast<T *>(buffer_->raw_mutable_data());
  }

+  inline void MarkUnused() {
+    unused_ = true;  // Read back via unused(); not cleared in this patch.
+  }
+
  inline void Clear() {
    MACE_CHECK_NOTNULL(buffer_);
    buffer_->Clear(raw_size());
@@ -362,6 +371,7 @@ class Tensor {
  BufferBase *buffer_;
  BufferSlice buffer_slice_;
  bool is_buffer_owner_;
+  bool unused_;
  std::string name_;

  MACE_DISABLE_COPY_AND_ASSIGN(Tensor);

--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -18,6 +18,9 @@
 #include <utility>

 #include "mace/core/arg_helper.h"
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#endif
 #include "mace/core/workspace.h"
 #include "mace/utils/timer.h"

@@ -85,6 +88,46 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
  VLOG(3) << "Model data size: " << model_data_size;

  if (model_data_size > 0) {
+#ifdef MACE_ENABLE_OPENCL
+    if (type == DeviceType::GPU &&
+        OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <=
+            static_cast<uint64_t>(model_data_size)) {
+      for (auto &const_tensor : net_def.tensors()) {
+        MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
+        VLOG(3) << "Tensor name: " << const_tensor.name()
+                << ", data type: " << const_tensor.data_type() << ", shape: "
+                << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
+                                                   const_tensor.dims().end()));
+        std::vector<index_t> dims;
+        for (const index_t d : const_tensor.dims()) {
+          dims.push_back(d);
+        }
+
+        std::unique_ptr<Tensor> tensor(
+            new Tensor(GetDeviceAllocator(type),
+                       const_tensor.data_type()));
+        tensor->Resize(dims);
+
+        MACE_CHECK(tensor->size() == const_tensor.data_size(),
+                   "Tensor's data_size not equal with the shape");
+        MACE_CHECK(const_tensor.offset() + tensor->raw_size() <=
+            model_data_size,
+                   "buffer offset + length (",
+                   const_tensor.offset(),
+                   " + ",
+                   tensor->raw_size(),
+                   ") should <= ",
+                   model_data_size);
+        tensor->CopyBytes(model_data + const_tensor.offset(),
+                          const_tensor.data_size() *
+                              GetEnumTypeSize(const_tensor.data_type()));
+
+        tensor_map_[const_tensor.name()] = std::move(tensor);
+      }
+    } else {
+#else
+    {
+#endif
      if (type == DeviceType::CPU) {
        tensor_buffer_ = std::unique_ptr<Buffer>(
            new Buffer(GetDeviceAllocator(type),
@@ -99,8 +142,6 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
                             0, model_data_size);
        tensor_buffer_->UnMap();
      }
-  }
-
      for (auto &const_tensor : net_def.tensors()) {
        MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
        VLOG(3) << "Tensor name: " << const_tensor.name()
@@ -113,7 +154,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
        }

        std::unique_ptr<Tensor> tensor(
-        new Tensor(BufferSlice(tensor_buffer_.get(), const_tensor.offset(),
+            new Tensor(BufferSlice(
+                tensor_buffer_.get(), const_tensor.offset(),
                const_tensor.data_size() *
                    GetEnumTypeSize(const_tensor.data_type())),
                       const_tensor.data_type()));
@@ -121,6 +163,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
        tensor->Reshape(dims);
        tensor_map_[const_tensor.name()] = std::move(tensor);
      }
+    }
+  }

  if (type == DeviceType::CPU || type == DeviceType::GPU) {
    MaceStatus status = CreateOutputTensorBuffer(net_def, type);
@@ -232,4 +276,17 @@ ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
  }
 }

+// Erases every tensor marked unused, then releases tensor_buffer_.
+void Workspace::RemoveUnusedBuffer() {
+  for (auto it = tensor_map_.begin(); it != tensor_map_.end();) {
+    if (it->second->unused()) {
+      it = tensor_map_.erase(it);  // erase() hands back the next entry.
+    } else {
+      ++it;
+    }
+  }
+
+  tensor_buffer_.reset(nullptr);
+}
+
 }  // namespace mace
--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -53,6 +53,8 @@ class Workspace {

  ScratchBuffer *GetScratchBuffer(DeviceType device_type);

+  void RemoveUnusedBuffer();
+
 private:
  MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
                                      DeviceType device_type);

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -180,6 +180,9 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
    };
  }

+  // Mark the buffer unused.
+  const_cast<Tensor *>(buffer)->MarkUnused();
+
  return MACE_SUCCESS;
 }


--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -222,6 +222,9 @@ MaceStatus MaceEngine::Impl::Init(
 #ifdef MACE_ENABLE_HEXAGON
  }
 #endif
+  if (device_type_ == DeviceType::GPU) {
+    ws_->RemoveUnusedBuffer();
+  }
  return MaceStatus::MACE_SUCCESS;
 }

@@ -240,7 +243,7 @@ MaceStatus MaceEngine::Impl::Init(
  }
  model_data_ = LoadModelData(model_data_file, model_data_size_);

-  Init(net_def, input_nodes, output_nodes, model_data_);
+  MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, model_data_));

  if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON) {
    UnloadModelData(model_data_, model_data_size_);

--- a/mace/public/mace_runtime.h
+++ b/mace/public/mace_runtime.h
@@ -66,6 +66,7 @@ class KVStorageFactory {
 class __attribute__((visibility("default"))) FileStorageFactory
    : public KVStorageFactory {
 public:
+  // Make sure your app has read and write permissions for the path.
  explicit FileStorageFactory(const std::string &path);

  ~FileStorageFactory();
@@ -77,7 +78,10 @@ class __attribute__((visibility("default"))) FileStorageFactory
  std::unique_ptr<Impl> impl_;
 };

-// Set KV store factory used as OpenCL cache. (Call Once)
+// Set Key-Value store factory. (Call Once)
+// Now KVStorage is used to store the built OpenCL binaries to file,
+// which could speed up the GPU initialization and first run.
+// If this API is not called, GPU initialization may be slow.
 __attribute__((visibility("default")))
 void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);