From de74ee378a7e38348add50fa9afd0606aaca9d3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?=
 <39303645+Shixiaowei02@users.noreply.github.com>
Date: Tue, 17 Mar 2020 15:51:59 +0800
Subject: [PATCH] feature: buffer sharing, test=develop (#3161)

* feature: buffer sharing, test=develop

* fix the warning of buffer.h, test=develop

* test cuda only if necessary, test=develop
---
 lite/core/lite_tensor_test.cc | 42 ++++++++++++++++++++++++++++++-----
 lite/core/memory.h            | 12 +++++++++-
 lite/core/tensor.cc           | 15 +++++++++++++
 lite/core/tensor.h            | 10 ++++++++-
 4 files changed, 71 insertions(+), 8 deletions(-)
diff --git a/lite/core/lite_tensor_test.cc b/lite/core/lite_tensor_test.cc
index d667a9f885..500dae3e28 100644
--- a/lite/core/lite_tensor_test.cc
+++ b/lite/core/lite_tensor_test.cc
@@ -13,19 +13,49 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+#include <cstring>
 #include "lite/core/tensor.h"
 
 namespace paddle {
 namespace lite {
 
-TEST(tensor, test) {
-  TensorLite tensor;
-  DDimLite ddim({1, 8});
-  tensor.Resize(ddim);
+template <typename Dtype, TargetType Target>
+void test_shared_memory_tensor() {  // Borrowed-buffer read-back, then death.
+  const std::vector<Dtype> data({0, 1, 2, 3});
+  const std::vector<int64_t> shape({2, 2});
+  const size_t size = data.size() * sizeof(Dtype);
+  TensorLite init_tensor;  // Owns the storage that shared_tensor borrows.
+  init_tensor.Assign<Dtype, DDim, Target>(data.data(),
+                                          static_cast<DDim>(shape));
+  Dtype* init_raw_data = init_tensor.mutable_data<Dtype>();
 
-  for (int i = 0; i < 8; i++) {
-    tensor.mutable_data<int>()[i] = i;
+  TensorLite shared_tensor(
+      std::make_shared<Buffer>(Buffer(init_raw_data, Target, size)));
+  Buffer host_buffer;
+  host_buffer.ResetLazy(TargetType::kHost, size);
+  if (Target == TargetType::kHost) {  // Read back through the shared tensor.
+    CopySync<Target>(
+        host_buffer.data(), shared_tensor.raw_data(), size, IoDirection::HtoH);
+  } else {
+    CopySync<Target>(
+        host_buffer.data(), shared_tensor.raw_data(), size, IoDirection::DtoH);
   }
+  EXPECT_EQ(std::memcmp(host_buffer.data(), data.data(), size), 0);
+  // Asking for more bytes than the borrowed buffer holds must trip the CHECK.
+  shared_tensor.Resize({1, 5});
+  ASSERT_DEATH(shared_tensor.mutable_data<Dtype>(), "");
+}
+
+TEST(tensor, shared_memory) {
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";  // for ASSERT_DEATH
+  test_shared_memory_tensor<float, TargetType::kHost>();
+  test_shared_memory_tensor<int64_t, TargetType::kHost>();
+  test_shared_memory_tensor<int8_t, TargetType::kHost>();
+#ifdef LITE_WITH_CUDA  // kCUDA variants are compiled only in CUDA builds
+  test_shared_memory_tensor<float, TargetType::kCUDA>();
+  test_shared_memory_tensor<int64_t, TargetType::kCUDA>();
+  test_shared_memory_tensor<int8_t, TargetType::kCUDA>();
+#endif
 }
 
 }  // namespace lite
diff --git a/lite/core/memory.h b/lite/core/memory.h
index 051d47bdde..71b475078c 100644
--- a/lite/core/memory.h
+++ b/lite/core/memory.h
@@ -15,6 +15,7 @@
 #pragma once
 #include "lite/api/paddle_place.h"
 #include "lite/core/target_wrapper.h"
+#include "lite/utils/logging.h"
 #include "lite/utils/macros.h"
 
 #ifdef LITE_WITH_OPENCL
@@ -81,6 +82,9 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
       TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, dir);
       break;
 #endif
+    default:
+      LOG(FATAL)
+          << "The copy function of this target has not been implemented yet.";
   }
 }
 
@@ -89,13 +93,17 @@ class Buffer {
  public:
   Buffer() = default;
   Buffer(TargetType target, size_t size) : space_(size), target_(target) {}
+  Buffer(void* data, TargetType target, size_t size)  // non-owning wrapper
+      : space_(size), data_(data), own_data_(false), target_(target) {}
 
   void* data() const { return data_; }
   TargetType target() const { return target_; }
   size_t space() const { return space_; }
+  bool own_data() const { return own_data_; }  // false: memory is borrowed
 
   void ResetLazy(TargetType target, size_t size) {
     if (target != target_ || space_ < size) {
+      CHECK_EQ(own_data_, true) << "Can not reset unowned buffer.";
       Free();
       data_ = TargetMalloc(target, size);
       target_ = target;
@@ -115,6 +123,7 @@ class Buffer {
                   4;  // 4 for RGBA, un-used for opencl Image2D
     if (target != target_ || cl_image2d_width_ < img_w ||
         cl_image2d_height_ < img_h) {
+      CHECK_EQ(own_data_, true) << "Can not reset unowned buffer.";
       Free();
       data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr);
       target_ = target;
@@ -126,7 +135,7 @@ class Buffer {
 #endif
 
   void Free() {
-    if (space_ > 0) {
+    if (space_ > 0 && own_data_) {
       TargetFree(target_, data_);
     }
     data_ = nullptr;
@@ -149,6 +158,7 @@ class Buffer {
   size_t cl_image2d_width_{0};   // only used for OpenCL Image2D
   size_t cl_image2d_height_{0};  // only used for OpenCL Image2D
   void* data_{nullptr};
+  bool own_data_{true};  // false: borrowed, Free() skips TargetFree
   TargetType target_{TargetType::kHost};
 };
 
diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc
index 7664633077..ecb9935dfd 100644
--- a/lite/core/tensor.cc
+++ b/lite/core/tensor.cc
@@ -98,6 +98,21 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
   return mutable_data(memory_size);
 }
 
+// Rebinds the tensor to a non-null `buffer` holding >= memory_size bytes.
+void TensorLite::ResetBuffer(std::shared_ptr<Buffer> buffer,
+                             size_t memory_size) {
+  CHECK_EQ(buffer != nullptr, true) << "Can not reset to a null buffer.";
+  CHECK_EQ(offset_, 0)
+      << "Only the offset is supported to zero when the Buffer is reset.";
+  CHECK_LE(memory_size_, buffer->space())
+      << "The space of buffer is not enough to store the tensor.";
+  CHECK_LE(memory_size, buffer->space())
+      << "The buffer is smaller than the specified minimum size.";
+  buffer_ = buffer;
+  memory_size_ = memory_size;
+  target_ = buffer->target();
+}
+
 #ifdef LITE_WITH_OPENCL
 template <>
 const cl::Image2D *TensorLite::data<float, cl::Image2D>() const {
diff --git a/lite/core/tensor.h b/lite/core/tensor.h
index 6e2e771be9..2209e524f4 100644
--- a/lite/core/tensor.h
+++ b/lite/core/tensor.h
@@ -102,9 +102,10 @@ using LoD = std::vector<std::vector<uint64_t>>;
 class TensorLite {
  public:
   TensorLite() : buffer_(std::make_shared<Buffer>()) {}
+  explicit TensorLite(std::shared_ptr<Buffer> buffer) : buffer_(buffer) {}
 
   template <typename DType, typename DimT, TargetType Target>
-  void Assign(DType *data, const DimT &dim) {
+  void Assign(const DType *data, const DimT &dim) {
     Resize(dim);
     auto *dst = mutable_data<DType, void>(Target);
     CopySync<Target>(
@@ -178,6 +179,11 @@ class TensorLite {
         (static_cast<char *>(buffer_->data()) + offset_));
   }
 
+  void *raw_data() {  // Untyped pointer to the buffer_ bytes at offset_.
+    char *const base = static_cast<char *>(buffer_->data());
+    return base + offset_;
+  }
+
   void clear() {
     buffer_->Free();
     offset_ = 0;
@@ -195,6 +201,8 @@ class TensorLite {
 
   void CopyDataFrom(const TensorLite &other);
 
+  void ResetBuffer(std::shared_ptr<Buffer> buffer, size_t memory_size);
+
   TargetType target() const { return target_; }
 
   template <typename T>
-- 
GitLab