未验证 提交 2753c16f 编写于 作者: A Aurelius84 提交者: GitHub

[Phi] Add ClearHolder when re-alloc on new place in DeviceContext (#39833)

* [Phi] Add ClearHolder when re-alloc on new place in DeviceContext

* fix hostAlloc

* fix inferRT unittest

* remove dev_ctx ptr
上级 282e09dc
......@@ -16,13 +16,13 @@ cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS phi_enforce)
cc_library(tensor_meta SRCS tensor_meta.cc DEPS phi_enforce)
cc_library(lod_utils SRCS lod_utils.cc DEPS phi_enforce)
cc_library(phi_device_context SRCS device_context.cc DEPS tensor_base)
cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS fluid_convert_utils tensor_meta tensor_base)
cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base)
cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base)
cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy)
cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils)
......
......@@ -94,9 +94,9 @@ void* DenseTensor::AllocateFrom(Allocator* allocator,
bytes));
bytes = requested_size;
}
// TODO(paddle-dev): In case of the allocator of storage_ is different with
// the incoming allocator, we should re-alloc data using the incoming
// allocator.
// NOTE(paddle-dev): In case the allocator of storage_ differs from the
// incoming allocator, we will re-alloc data using the incoming
// allocator. See DeviceContext.Alloc in core/device_context.cc.
if (!holder_ || holder_->size() < bytes + meta_.offset) {
meta_.offset = 0;
VLOG(10) << "Allocate data with bytes: " << bytes;
......
......@@ -13,8 +13,9 @@
// limitations under the License.
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/tensor_base.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
using DataType = paddle::experimental::DataType;
......@@ -72,6 +73,7 @@ struct DeviceContext::Impl {
}
void* Alloc(TensorBase* tensor,
const Place& place,
DataType dtype = DataType::UNDEFINED,
size_t requested_size = 0) const {
PADDLE_ENFORCE_NOT_NULL(
......@@ -81,6 +83,12 @@ struct DeviceContext::Impl {
if (dtype == DataType::UNDEFINED) {
dtype = tensor->dtype();
}
// NOTE(paddle-dev): In case the tensor already holds an allocation and
// is going to allocate on a new place, we clear its holder first and
// then re-alloc it.
if (tensor->initialized() && tensor->place() != place) {
ClearHolder(tensor);
}
auto* allocator =
tensor->numel() == 0 ? zero_allocator_ : device_allocator_;
return tensor->AllocateFrom(
......@@ -88,9 +96,11 @@ struct DeviceContext::Impl {
}
template <typename T>
T* Alloc(TensorBase* tensor, size_t requested_size = 0) const {
T* Alloc(TensorBase* tensor,
const Place& place,
size_t requested_size = 0) const {
DataType dtype = paddle::experimental::CppTypeToDataType<T>::Type();
return static_cast<T*>(Alloc(tensor, dtype, requested_size));
return static_cast<T*>(Alloc(tensor, place, dtype, requested_size));
}
void* HostAlloc(TensorBase* tensor,
......@@ -103,6 +113,9 @@ struct DeviceContext::Impl {
if (dtype == DataType::UNDEFINED) {
dtype = tensor->dtype();
}
if (tensor->initialized() && tensor->place() != CPUPlace()) {
ClearHolder(tensor);
}
auto* allocator = tensor->numel() == 0 ? zero_allocator_ : host_allocator_;
return tensor->AllocateFrom(
const_cast<Allocator*>(allocator), dtype, requested_size);
......@@ -147,6 +160,19 @@ struct DeviceContext::Impl {
}
private:
void ClearHolder(TensorBase* tensor) const {
if (!tensor->initialized()) return;
if (DenseTensor::classof(tensor)) {
static_cast<DenseTensor*>(tensor)->clear();
} else if (SelectedRows::classof(tensor)) {
static_cast<SelectedRows*>(tensor)->mutable_value()->clear();
} else {
PADDLE_THROW(errors::Unimplemented(
"Only support DenseTensor and SelectedRows now."));
}
}
const Allocator* device_allocator_{nullptr};
const Allocator* host_allocator_{nullptr};
const Allocator* zero_allocator_{nullptr};
......@@ -168,7 +194,7 @@ DeviceContext::DeviceContext(DeviceContext&& other) {
impl_ = std::move(other.impl_);
}
DeviceContext& DeviceContext::operator=(DeviceContext&&) = default;
DeviceContext& DeviceContext::operator=(DeviceContext&& other) = default;
DeviceContext::~DeviceContext() = default;
......@@ -199,12 +225,12 @@ const Allocator& DeviceContext::GetZeroAllocator() const {
void* DeviceContext::Alloc(TensorBase* tensor,
DataType dtype,
size_t requested_size) const {
return impl_->Alloc(tensor, dtype, requested_size);
return impl_->Alloc(tensor, GetPlace(), dtype, requested_size);
}
template <typename T>
T* DeviceContext::Alloc(TensorBase* tensor, size_t requested_size) const {
return impl_->Alloc<T>(tensor, requested_size);
return impl_->Alloc<T>(tensor, GetPlace(), requested_size);
}
void* DeviceContext::HostAlloc(TensorBase* tensor,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册