Unverified · Commit 277cf900 authored by 石晓伟, committed by GitHub

splits allocation for pten, test=develop (#38853)

Parent 0efcae86
......@@ -410,8 +410,8 @@ class ExecutionContext {
auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T));
auto& deleter = tmp_allocation_ptr.get_deleter();
auto* allocation_ptr = tmp_allocation_ptr.release();
auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
allocation_ptr, deleter);
auto shared_allocation =
std::shared_ptr<pten::Allocation>(allocation_ptr, deleter);
PADDLE_ENFORCE_GE(
allocation_ptr->size(), framework::product(dim) * sizeof(T),
......
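The hunk above transfers ownership of a temporary allocation out of a unique_ptr and into a shared_ptr<pten::Allocation>, keeping the original deleter alive. A minimal standalone sketch of that release-plus-deleter pattern, with a stand-in Buffer type instead of the real allocation classes:

#include <cstddef>
#include <iostream>
#include <memory>

struct Buffer { size_t size; };  // stand-in for pten::Allocation

int main() {
  auto deleter = [](Buffer* b) {
    std::cout << "freeing " << b->size << " bytes\n";
    delete b;
  };
  std::unique_ptr<Buffer, decltype(deleter)> tmp(new Buffer{64}, deleter);
  // Grab the deleter, release the raw pointer, and hand both to shared_ptr,
  // mirroring what ExecutionContext does with tmp_allocation_ptr.
  auto d = tmp.get_deleter();
  auto shared = std::shared_ptr<Buffer>(tmp.release(), d);
  std::cout << shared->size << " bytes shared\n";
}  // deleter runs when the last shared_ptr copy dies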
......@@ -17,14 +17,6 @@ limitations under the License. */
DECLARE_bool(use_stream_safe_cuda_allocator);
namespace paddle {
namespace memory {
namespace allocation {
class Allocation;
} // namespace allocation
} // namespace memory
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -32,14 +32,6 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h"
namespace paddle {
namespace memory {
namespace allocation {
class Allocation;
} // namespace allocation
} // namespace memory
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -151,8 +151,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
paddle::memory::allocation::Allocation* allocation =
npu_pinned_tensor.Holder().get();
pten::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
......
......@@ -183,8 +183,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
paddle::memory::allocation::Allocation* allocation =
npu_pinned_tensor.Holder().get();
pten::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
......@@ -241,8 +240,7 @@ void TensorFromVector(const std::vector<T>& src,
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
paddle::memory::allocation::Allocation* allocation =
npu_pinned_tensor.Holder().get();
pten::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
......@@ -312,8 +310,7 @@ inline void TensorFromVector(const std::vector<bool>& src,
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
paddle::memory::allocation::Allocation* allocation =
npu_pinned_tensor.Holder().get();
pten::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
......
......@@ -223,9 +223,10 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
auto t_place = tensor->place();
paddle::framework::Tensor out;
auto mem_allocation = std::make_shared<paddle::memory::Allocation>(
static_cast<void *>(data), ele_num * sizeof(T),
paddle::platform::CPUPlace());
auto mem_allocation =
std::make_shared<paddle::memory::allocation::Allocation>(
static_cast<void *>(data), ele_num * sizeof(T),
paddle::platform::CPUPlace());
out.ResetHolder(mem_allocation);
if (paddle::platform::is_cpu_place(t_place)) {
......
......@@ -257,9 +257,8 @@ void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) {
size_t memory_size =
GetLiteTensorNumel(*src) *
framework::SizeOfType(GetNativePrecisionType(src->precision()));
std::shared_ptr<memory::allocation::Allocation> holder(
new memory::allocation::Allocation(src_raw_data, memory_size,
GetNativePlace(src->target())));
std::shared_ptr<pten::Allocation> holder(new pten::Allocation(
src_raw_data, memory_size, GetNativePlace(src->target())));
dst->Resize(paddle::framework::make_ddim(src->shape()));
SetLoD(dst->mutable_lod(), src->lod());
dst->ResetHolderWithType(holder, GetNativePrecisionType(src->precision()));
......
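For reference, the TensorDataShare hunk performs a zero-copy share: externally owned memory is wrapped in a holder that never frees it. A hedged sketch, with Handle standing in for a deleter-less pten::Allocation:

#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

struct Handle {  // stand-in for pten::Allocation without a deleter
  void* data;
  size_t size;
};

int main() {
  std::vector<float> lite_buffer(256, 1.0f);  // owned by the source framework
  auto holder = std::make_shared<Handle>(
      Handle{lite_buffer.data(), lite_buffer.size() * sizeof(float)});
  std::cout << "sharing " << holder->size << " bytes at " << holder->data << "\n";
}  // the holder dies without touching lite_buffer; the source still owns it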
......@@ -23,7 +23,7 @@ namespace allocation {
// For memory address alignment
class AlignedAllocation : public Allocation {
public:
AlignedAllocation(AllocationPtr underlying_allocation, size_t offset)
AlignedAllocation(DecoratedAllocationPtr underlying_allocation, size_t offset)
: Allocation(
reinterpret_cast<uint8_t*>(underlying_allocation->ptr()) + offset,
underlying_allocation->base_ptr(),
......@@ -32,7 +32,7 @@ class AlignedAllocation : public Allocation {
underlying_allocation_(std::move(underlying_allocation)) {}
private:
AllocationPtr underlying_allocation_;
DecoratedAllocationPtr underlying_allocation_;
};
AlignedAllocator::AlignedAllocator(
......@@ -52,13 +52,17 @@ bool AlignedAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
Allocation* AlignedAllocator::AllocateImpl(size_t size) {
pten::Allocation* AlignedAllocator::AllocateImpl(size_t size) {
auto raw_allocation = underlying_allocator_->Allocate(size + alignment_);
size_t offset = AlignedPtrOffset(raw_allocation->ptr(), alignment_);
return new AlignedAllocation(std::move(raw_allocation), offset);
auto* p = new AlignedAllocation(
static_unique_ptr_cast<Allocation>(std::move(raw_allocation)), offset);
return p;
}
void AlignedAllocator::FreeImpl(Allocation* allocation) { delete allocation; }
void AlignedAllocator::FreeImpl(pten::Allocation* allocation) {
delete allocation;
}
} // namespace allocation
} // namespace memory
......
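AlignedAllocator's scheme is over-allocate-then-offset: request size + alignment from the underlying allocator, then shift the returned pointer forward until it is aligned. A self-contained sketch of the arithmetic (AlignedPtrOffset mirrors the helper in this diff; the malloc/free framing is illustrative):

#include <cstdint>
#include <cstdlib>
#include <iostream>

size_t AlignedPtrOffset(const void* ptr, size_t alignment) {
  auto p = reinterpret_cast<uintptr_t>(ptr);
  auto diff = p % alignment;
  return diff == 0 ? 0 : alignment - diff;
}

int main() {
  const size_t alignment = 64, size = 100;
  void* raw = std::malloc(size + alignment);  // the underlying allocation
  size_t offset = AlignedPtrOffset(raw, alignment);
  void* aligned = static_cast<uint8_t*>(raw) + offset;
  std::cout << "offset=" << offset << ", aligned % 64 = "
            << reinterpret_cast<uintptr_t>(aligned) % alignment << "\n";
  std::free(raw);  // AlignedAllocation keeps the raw pointer so the free is exact
}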
......@@ -30,9 +30,9 @@ class AlignedAllocator : public Allocator {
bool IsAllocThreadSafe() const override;
protected:
Allocation* AllocateImpl(size_t size) override;
pten::Allocation* AllocateImpl(size_t size) override;
void FreeImpl(Allocation* allocation) override;
void FreeImpl(pten::Allocation* allocation) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
......
......@@ -18,11 +18,10 @@ namespace paddle {
namespace memory {
namespace allocation {
bool Allocator::IsAllocThreadSafe() const { return false; }
void Allocator::FreeImpl(Allocation* allocation) {
Allocator* allocator = allocation->TopDecoratedAllocator();
allocator->Free(allocation);
void Allocator::FreeImpl(pten::Allocation* allocation) {
static_cast<Allocation*>(allocation)
->TopDecoratedAllocator()
->Free(allocation);
}
} // namespace allocation
......
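This FreeImpl leans on the decorated-allocator chain: every Allocate() pushes the wrapping allocator onto the allocation, and Free() pops the top entry, so deallocation unwinds the decorators in reverse. A simplified sketch of that mechanism, with shortened names for what the real code keeps in decorated_allocators_:

#include <cstdlib>
#include <iostream>
#include <vector>

class Allocator;

class Allocation {
 public:
  explicit Allocation(void* ptr) : ptr_(ptr) {}
  void* ptr() const { return ptr_; }
  void Register(Allocator* a) { chain_.push_back(a); }
  Allocator* Top() const { return chain_.back(); }
  void Pop() { chain_.pop_back(); }

 private:
  void* ptr_;
  std::vector<Allocator*> chain_;  // innermost allocator first
};

class Allocator {
 public:
  virtual ~Allocator() = default;
  Allocation* Allocate(size_t n) {
    Allocation* a = AllocateImpl(n);
    a->Register(this);  // this allocator now tops the chain
    return a;
  }
  void Free(Allocation* a) {
    a->Pop();
    FreeImpl(a);
  }

 protected:
  virtual Allocation* AllocateImpl(size_t n) = 0;
  // Default FreeImpl re-dispatches to whichever decorator is now on top.
  virtual void FreeImpl(Allocation* a) { a->Top()->Free(a); }
};

class CPUAllocator : public Allocator {
 protected:
  Allocation* AllocateImpl(size_t n) override {
    return new Allocation(std::malloc(n));
  }
  void FreeImpl(Allocation* a) override {
    std::free(a->ptr());
    delete a;
  }
};

class LoggingAllocator : public Allocator {  // a decorator
 public:
  explicit LoggingAllocator(Allocator* u) : underlying_(u) {}

 protected:
  Allocation* AllocateImpl(size_t n) override {
    std::cout << "alloc " << n << "\n";
    return underlying_->Allocate(n);  // inner Allocate registers the CPUAllocator
  }
  // Inherits the default FreeImpl: after this decorator is popped,
  // the CPUAllocator is on top and actually frees.

 private:
  Allocator* underlying_;
};

int main() {
  CPUAllocator cpu;
  LoggingAllocator logging(&cpu);
  Allocation* a = logging.Allocate(16);  // chain: [cpu, logging]
  logging.Free(a);                       // pops logging, then cpu frees
}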
......@@ -22,6 +22,7 @@
#include "paddle/fluid/framework/inlined_vector.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/core/allocator.h"
DECLARE_string(allocator_strategy);
......@@ -80,30 +81,19 @@ class Allocator;
* e.g., something like what is done in AlignedAllocator, etc.
* In this case, we should declare a derived class of Allocation, which
* contains an underlying Allocation allocated by the underlying allocator.
* Therefore, `decorated_allocators_` of the new Allocation object would
* Therefore, `decorated_allocators_` of the new Allocation object
* would
* be a new chain, differing from the underlying Allocation object.
*/
class Allocation {
class Allocation : public pten::Allocation {
public:
inline Allocation(void* ptr, size_t size, platform::Place place)
: ptr_(ptr), base_ptr_(ptr), size_(size), place_(place) {}
inline Allocation(void* ptr, void* base_ptr, size_t size,
platform::Place place)
: ptr_(ptr), base_ptr_(base_ptr), size_(size), place_(place) {}
Allocation(const Allocation& o) = delete;
Allocation& operator=(const Allocation& o) = delete;
Allocation(Allocation&& o) = delete;
Allocation& operator=(Allocation&& o) = delete;
// Returns the holding pointer.
// NOTE: For performance considerations, it is better not to make this method
// a virtual method. If we want to implement `defragmentation` later,
// we might need to make the `ptr_` field a protected field, and add a virtual
// method like `defragmentation` to change `ptr_`.
inline void* ptr() const { return ptr_; }
inline void* base_ptr() const {
Allocation(void* ptr, size_t size, platform::Place place)
: pten::Allocation(ptr, size, place), base_ptr_(ptr) {}
Allocation(void* ptr, void* base_ptr, size_t size,
const platform::Place& place)
: pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {}
void* base_ptr() const {
PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth",
paddle::platform::errors::Unimplemented(
"base_ptr() is only implemented for auto_growth "
......@@ -112,21 +102,6 @@ class Allocation {
return base_ptr_;
}
// Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
// last valid element.
//
// NOTE: Some allocators might allocate more memory than requested. The size
// could be larger than the request. For example,
// the AlignedAllocator will always allocate memory as size + kAlignment.
// The raw pointer might not be aligned, so an offset might be added to the raw
// pointer. The size of this allocation will be
// `size + kAlignment - offset`.
inline size_t size() const { return size_; }
inline const platform::Place& place() const { return place_; }
virtual ~Allocation() {}
private:
inline void RegisterDecoratedAllocator(Allocator* allocator) {
decorated_allocators_.emplace_back(allocator);
......@@ -139,10 +114,7 @@ class Allocation {
}
private:
void* ptr_;
void* base_ptr_; // the point that directly requested from system
size_t size_;
platform::Place place_;
/**
* NOTE(zjl): Since decorated_allocators_ is usually a small vector.
......@@ -162,53 +134,42 @@ class Allocation {
friend class Allocator;
};
using AllocationPtr = pten::Allocator::AllocationPtr;
using DecoratedAllocationPtr =
std::unique_ptr<Allocation, pten::Allocator::DeleterType>;
// Base interface class of memory Allocator.
class Allocator {
class Allocator : public pten::Allocator {
public:
virtual ~Allocator() {}
class AllocationDeleter {
public:
inline void operator()(Allocation* allocation) const {
Allocator* allocator = allocation->TopDecoratedAllocator();
allocator->Free(allocation);
}
};
using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
static void AllocationDeleter(pten::Allocation* allocation) {
Allocator* allocator =
static_cast<Allocation*>(allocation)->TopDecoratedAllocator();
allocator->Free(allocation);
}
// Allocate an allocation.
// size may be 0, but it would be too complex if we handle size == 0
// in each Allocator. So we handle size == 0 inside AllocatorFacade
// in our design.
inline AllocationPtr Allocate(size_t size) {
AllocationPtr Allocate(size_t size) override {
auto ptr = AllocateImpl(size);
ptr->RegisterDecoratedAllocator(this);
return AllocationPtr(ptr);
static_cast<Allocation*>(ptr)->RegisterDecoratedAllocator(this);
return AllocationPtr(ptr, AllocationDeleter);
}
// This function should not be called outside Allocator class
inline void Free(Allocation* allocation) {
allocation->PopDecoratedAllocator();
void Free(pten::Allocation* allocation) {
static_cast<Allocation*>(allocation)->PopDecoratedAllocator();
FreeImpl(allocation);
}
inline uint64_t Release(const platform::Place& place) {
return ReleaseImpl(place);
}
// True if the `Allocate` is thread safe.
virtual bool IsAllocThreadSafe() const;
uint64_t Release(const platform::Place& place) { return ReleaseImpl(place); }
protected:
virtual Allocation* AllocateImpl(size_t size) = 0;
virtual void FreeImpl(Allocation* allocation);
virtual pten::Allocation* AllocateImpl(size_t size) = 0;
virtual void FreeImpl(pten::Allocation* allocation);
virtual uint64_t ReleaseImpl(const platform::Place& place) { return 0; }
};
using AllocationDeleter = Allocator::AllocationDeleter;
using AllocationPtr = Allocator::AllocationPtr;
inline size_t AlignedSize(size_t size, size_t alignment) {
auto remaining = size % alignment;
return remaining == 0 ? size : size + alignment - remaining;
......@@ -220,6 +181,14 @@ inline size_t AlignedPtrOffset(const void* ptr, size_t alignment) {
return diff == 0 ? 0 : alignment - diff;
}
template <typename Derived, typename Base, typename BaseDel>
decltype(auto) static_unique_ptr_cast(std::unique_ptr<Base, BaseDel>&& p) {
static_assert(std::is_base_of<Base, Derived>::value,
"Derived type must derive from Base.");
auto d = static_cast<Derived*>(p.release());
return std::unique_ptr<Derived, BaseDel>(d, p.get_deleter());
}
} // namespace allocation
} // namespace memory
} // namespace paddle
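static_unique_ptr_cast is the new glue between pten::Allocator::AllocationPtr (a unique_ptr with a type-erased std::function deleter) and the fluid-side DecoratedAllocationPtr. A minimal usage sketch under that assumption:

#include <functional>
#include <iostream>
#include <memory>
#include <type_traits>

struct Base { virtual ~Base() = default; };
struct Derived : Base { int tag = 7; };

// Reproduced from the diff: downcast a unique_ptr while keeping its deleter.
template <typename D, typename B, typename BaseDel>
decltype(auto) static_unique_ptr_cast(std::unique_ptr<B, BaseDel>&& p) {
  static_assert(std::is_base_of<B, D>::value, "D must derive from B.");
  auto d = static_cast<D*>(p.release());
  return std::unique_ptr<D, BaseDel>(d, p.get_deleter());
}

int main() {
  using Del = std::function<void(Base*)>;
  std::unique_ptr<Base, Del> p(new Derived, [](Base* b) { delete b; });
  auto dp = static_unique_ptr_cast<Derived>(std::move(p));
  std::cout << dp->tag << "\n";  // 7; still freed through the preserved deleter
}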
......@@ -94,7 +94,7 @@ class CUDAGraphAllocator
class PrivateAllocation : public Allocation {
public:
PrivateAllocation(CUDAGraphAllocator* allocator,
AllocationPtr underlying_allocation)
DecoratedAllocationPtr underlying_allocation)
: Allocation(
underlying_allocation->ptr(), underlying_allocation->base_ptr(),
underlying_allocation->size(), underlying_allocation->place()),
......@@ -103,7 +103,7 @@ class CUDAGraphAllocator
private:
std::shared_ptr<Allocator> allocator_;
AllocationPtr underlying_allocation_;
DecoratedAllocationPtr underlying_allocation_;
};
explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
......@@ -116,12 +116,14 @@ class CUDAGraphAllocator
}
protected:
Allocation* AllocateImpl(size_t size) {
pten::Allocation* AllocateImpl(size_t size) {
VLOG(10) << "Allocate " << size << " for CUDA Graph";
return new PrivateAllocation(this, underlying_allocator_->Allocate(size));
return new PrivateAllocation(this,
static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(size)));
}
void FreeImpl(Allocation* allocation) {
void FreeImpl(pten::Allocation* allocation) {
VLOG(10) << "delete for CUDA Graph";
delete allocation;
}
......@@ -322,7 +324,7 @@ class AllocatorFacadePrivate {
return static_cast<platform::CUDADeviceContext*>(pool.Get(place))->stream();
}
void RecordStream(std::shared_ptr<Allocation> allocation,
void RecordStream(std::shared_ptr<pten::Allocation> allocation,
const gpuStream_t& stream) {
if (allocation->size() == 0) {
return;
......@@ -339,7 +341,7 @@ class AllocatorFacadePrivate {
}
const gpuStream_t& GetStream(
const std::shared_ptr<Allocation>& allocation) const {
const std::shared_ptr<pten::Allocation>& allocation) const {
const StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
dynamic_cast<const StreamSafeCUDAAllocation*>(allocation.get());
PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
......@@ -391,10 +393,10 @@ class AllocatorFacadePrivate {
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation* AllocateImpl(size_t size) override {
pten::Allocation* AllocateImpl(size_t size) override {
return new Allocation(nullptr, 0, place_);
}
void FreeImpl(Allocation* allocation) override { delete allocation; }
void FreeImpl(pten::Allocation* allocation) override { delete allocation; }
private:
platform::Place place_;
......@@ -820,9 +822,9 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
std::shared_ptr<pten::Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size) {
return std::shared_ptr<Allocation>(Alloc(place, size));
return std::shared_ptr<pten::Allocation>(Alloc(place, size));
}
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
......@@ -866,7 +868,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
->Release(place);
}
std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
std::shared_ptr<pten::Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size, const platform::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
......@@ -884,14 +886,14 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
}
#endif
gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
return std::shared_ptr<Allocation>(Alloc(place, size, s));
return std::shared_ptr<pten::Allocation>(Alloc(place, size, s));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
}
bool AllocatorFacade::InSameStream(
const std::shared_ptr<Allocation>& allocation,
const std::shared_ptr<pten::Allocation>& allocation,
const platform::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
......@@ -962,7 +964,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
return m_->GetAllocator(place, stream)->Release(place);
}
void AllocatorFacade::RecordStream(std::shared_ptr<Allocation> allocation,
void AllocatorFacade::RecordStream(std::shared_ptr<pten::Allocation> allocation,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
......@@ -983,7 +985,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr<Allocation> allocation,
}
const gpuStream_t& AllocatorFacade::GetStream(
const std::shared_ptr<Allocation>& allocation) const {
const std::shared_ptr<pten::Allocation>& allocation) const {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
......
......@@ -42,6 +42,7 @@ using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator;
class AllocatorFacadePrivate;
class AllocatorFacade {
public:
using Allocation = pten::Allocation;
AllocatorFacade(const AllocatorFacade& o) = delete;
const AllocatorFacade& operator=(const AllocatorFacade& o) = delete;
~AllocatorFacade();
......
......@@ -45,7 +45,8 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
allow_free_idle_chunk_(allow_free_idle_chunk) {}
Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) {
pten::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
size_t unaligned_size) {
size_t size = AlignedSize(unaligned_size, alignment_);
VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size;
......@@ -78,11 +79,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) {
size_t realloc_size = std::max(size, chunk_size_);
try {
chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size));
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(realloc_size)));
} catch (BadAlloc &ex) {
if (FLAGS_free_when_no_cache_hit) throw ex;
FreeIdleChunks();
chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size));
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(realloc_size)));
}
auto *chunk = &(*chunks_.rbegin());
......@@ -104,7 +107,7 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) {
return new BlockAllocation(block_it);
}
void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
void AutoGrowthBestFitAllocator::FreeImpl(pten::Allocation *allocation) {
VLOG(10) << "Free " << allocation->size()
<< " bytes, ptr = " << allocation->ptr();
std::lock_guard<SpinLock> guard(spinlock_);
......
......@@ -36,9 +36,9 @@ class AutoGrowthBestFitAllocator : public Allocator {
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation *AllocateImpl(size_t size) override;
pten::Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
void FreeImpl(pten::Allocation *allocation) override;
// Release the memory block which is not used in pool.
uint64_t ReleaseImpl(const platform::Place &place) override {
......@@ -64,10 +64,10 @@ class AutoGrowthBestFitAllocator : public Allocator {
};
struct Chunk {
explicit Chunk(AllocationPtr allocation)
explicit Chunk(DecoratedAllocationPtr allocation)
: allocation_(std::move(allocation)) {}
AllocationPtr allocation_;
DecoratedAllocationPtr allocation_;
List<Block> blocks_;
};
......
......@@ -28,12 +28,12 @@ namespace allocation {
class RecordedAllocator : public Allocator {
protected:
Allocation *AllocateImpl(size_t size) override {
pten::Allocation *AllocateImpl(size_t size) override {
allocated_size_ += size;
return new Allocation(malloc(size), size, platform::CPUPlace());
}
void FreeImpl(Allocation *allocation) {
void FreeImpl(pten::Allocation *allocation) {
allocated_size_ -= allocation->size();
free(allocation->ptr());
delete allocation;
......@@ -79,7 +79,7 @@ class LimitedResourceAllocator : public Allocator {
size_t AllocatedSize() const { return allocated_size_; }
protected:
Allocation *AllocateImpl(size_t size) override {
pten::Allocation *AllocateImpl(size_t size) override {
if (allocated_size_ + size > capacity_) {
throw BadAlloc("", __FILE__, __LINE__);
}
......@@ -88,7 +88,7 @@ class LimitedResourceAllocator : public Allocator {
return new Allocation(malloc(size), size, platform::CPUPlace());
}
void FreeImpl(Allocation *allocation) {
void FreeImpl(pten::Allocation *allocation) {
allocated_size_ -= allocation->size();
free(allocation->ptr());
delete allocation;
......
......@@ -37,7 +37,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test {
size_t size = dis_(random_engine_);
AllocationPtr allocation = Alloc(place_, size);
void* base_ptr = allocation->base_ptr();
void* base_ptr = static_cast<Allocation*>(allocation.get())->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
......@@ -56,7 +56,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test {
size_t size = dis_(random_engine_);
AllocationPtr allocation = Alloc(place_, size);
void* base_ptr = allocation->base_ptr();
void* base_ptr = static_cast<Allocation*>(allocation.get())->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
......@@ -77,7 +77,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test {
size_t size = dis_(random_engine_);
AllocationPtr allocation = Alloc(place_, size);
void* base_ptr = allocation->base_ptr();
void* base_ptr = static_cast<Allocation*>(allocation.get())->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
......@@ -91,7 +91,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test {
void ZeroSizeAllocTest() {
AllocationPtr allocation = Alloc(place_, 0);
void* base_ptr = allocation->base_ptr();
void* base_ptr = static_cast<Allocation*>(allocation.get())->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
......
......@@ -33,7 +33,7 @@ static int HighestBitPos(size_t N) {
}
}
BestFitAllocator::BestFitAllocator(Allocation* allocation)
BestFitAllocator::BestFitAllocator(pten::Allocation* allocation)
: allocation_(allocation) {
details::Chunk chunk;
chunk.size_ = allocation_->size();
......@@ -115,7 +115,7 @@ size_t BestFitAllocator::NumFreeChunks() const {
}
return num;
}
void BestFitAllocator::FreeImpl(Allocation* allocation) {
void BestFitAllocator::FreeImpl(pten::Allocation* allocation) {
auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(
bf_allocation,
......@@ -150,7 +150,7 @@ void BestFitAllocator::FreeImpl(Allocation* allocation) {
InsertFreeNode(chunk_it);
delete allocation;
}
Allocation* BestFitAllocator::AllocateImpl(size_t size) {
pten::Allocation* BestFitAllocator::AllocateImpl(size_t size) {
auto highest_set_bit = static_cast<size_t>(HighestBitPos(size));
MapIt map_it;
for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) {
......
......@@ -108,7 +108,7 @@ class BestFitAllocation : public Allocation {
// the prev-chunk and the next-chunk when possible.
class BestFitAllocator : public Allocator {
public:
explicit BestFitAllocator(Allocation* allocation);
explicit BestFitAllocator(pten::Allocation* allocation);
void* BasePtr() const { return allocation_->ptr(); }
......@@ -127,11 +127,11 @@ class BestFitAllocator : public Allocator {
void InsertFreeNode(const ListIt& it);
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation* allocation) override;
pten::Allocation* AllocateImpl(size_t size) override;
private:
Allocation* allocation_; // not owned
pten::Allocation* allocation_; // not owned
details::ChunkList chunks_;
details::FreeChunkBin free_chunks_;
};
......
......@@ -46,12 +46,13 @@ void BufferedAllocator::FreeCache(size_t size) {
bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; }
void BufferedAllocator::FreeImpl(Allocation *allocation) {
void BufferedAllocator::FreeImpl(pten::Allocation *allocation) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
allocations_.emplace(allocation->size(), AllocationPtr(allocation));
allocations_.emplace(allocation->size(),
AllocationPtr(allocation, Allocator::AllocationDeleter));
}
Allocation *BufferedAllocator::AllocateImpl(size_t size) {
pten::Allocation *BufferedAllocator::AllocateImpl(size_t size) {
{
platform::LockGuardPtr<std::mutex> guard(mtx_);
auto it = allocations_.lower_bound(size);
......
......@@ -45,8 +45,8 @@ class BufferedAllocator : public Allocator {
void FreeCache(size_t size);
protected:
void FreeImpl(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation *allocation) override;
pten::Allocation *AllocateImpl(size_t size) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
......
......@@ -27,7 +27,7 @@ namespace memory {
namespace allocation {
inline std::unique_ptr<BufferedAllocator> GetBufferedAllocator(
Allocation *allocation, bool thread_safe) {
pten::Allocation *allocation, bool thread_safe) {
std::unique_ptr<Allocator> allocator(new BestFitAllocator(allocation));
if (thread_safe) {
allocator.reset(new LockedAllocator(std::move(allocator)));
......@@ -68,7 +68,7 @@ class StubAllocator : public Allocator {
size_t GetFreeCount() const { return destruct_count_; }
protected:
void FreeImpl(Allocation *allocation) override {
void FreeImpl(pten::Allocation *allocation) override {
auto *alloc = dynamic_cast<StubAllocation *>(allocation);
PADDLE_ENFORCE_NOT_NULL(
alloc, platform::errors::InvalidArgument(
......@@ -77,7 +77,7 @@ class StubAllocator : public Allocator {
++destruct_count_;
delete allocation;
}
Allocation *AllocateImpl(size_t size) override {
pten::Allocation *AllocateImpl(size_t size) override {
++construct_count_;
if (size == 0) {
return new StubAllocation(nullptr, 0, platform::CPUPlace());
......
......@@ -24,7 +24,7 @@ namespace allocation {
bool CPUAllocator::IsAllocThreadSafe() const { return true; }
void CPUAllocator::FreeImpl(Allocation *allocation) {
void CPUAllocator::FreeImpl(pten::Allocation *allocation) {
void *p = allocation->ptr();
#ifdef _WIN32
_aligned_free(p);
......@@ -34,7 +34,7 @@ void CPUAllocator::FreeImpl(Allocation *allocation) {
delete allocation;
}
Allocation *CPUAllocator::AllocateImpl(size_t size) {
pten::Allocation *CPUAllocator::AllocateImpl(size_t size) {
void *p;
#ifdef _WIN32
p = _aligned_malloc(size, kAlignment);
......
......@@ -37,8 +37,8 @@ class CPUAllocator : public Allocator {
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation* allocation) override;
pten::Allocation* AllocateImpl(size_t size) override;
};
} // namespace allocation
} // namespace memory
......
......@@ -32,7 +32,7 @@ namespace paddle {
namespace memory {
namespace allocation {
bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
void CUDAAllocator::FreeImpl(Allocation* allocation) {
void CUDAAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_,
platform::errors::PermissionDenied(
......@@ -42,7 +42,7 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) {
delete allocation;
}
Allocation* CUDAAllocator::AllocateImpl(size_t size) {
pten::Allocation* CUDAAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); });
void* ptr;
......
......@@ -28,8 +28,8 @@ class CUDAAllocator : public Allocator {
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation* allocation) override;
pten::Allocation* AllocateImpl(size_t size) override;
private:
platform::CUDAPlace place_;
......
......@@ -41,7 +41,7 @@ namespace allocation {
*/
class CUDADeviceContextAllocation : public Allocation {
public:
explicit CUDADeviceContextAllocation(AllocationPtr allocation)
explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation)
: Allocation(allocation->ptr(), allocation->base_ptr(),
allocation->size(), allocation->place()),
underlying_allocation_(std::move(allocation)) {}
......@@ -56,7 +56,7 @@ class CUDADeviceContextAllocation : public Allocation {
<< p_allocation;
dev_ctx_->AddStreamCallback([p_allocation] {
VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation;
AllocationDeleter()(p_allocation);
Allocator::AllocationDeleter(p_allocation);
});
}
......@@ -65,7 +65,7 @@ class CUDADeviceContextAllocation : public Allocation {
}
private:
AllocationPtr underlying_allocation_;
DecoratedAllocationPtr underlying_allocation_;
const platform::CUDADeviceContext *dev_ctx_{nullptr};
};
......@@ -102,14 +102,14 @@ class CUDADeviceContextAllocator : public Allocator {
}
protected:
Allocation *AllocateImpl(size_t size) override {
pten::Allocation *AllocateImpl(size_t size) override {
PADDLE_ENFORCE_NOT_NULL(
default_stream_,
platform::errors::PreconditionNotMet(
"Default stream is not set for CUDADeviceContextAllocator"));
platform::CUDADeviceGuard guard(place_.device);
auto allocation =
new CUDADeviceContextAllocation(memory::Alloc(place_, size));
auto allocation = new CUDADeviceContextAllocation(
static_unique_ptr_cast<Allocation>(memory::Alloc(place_, size)));
// Wait for the event on stream
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_));
......@@ -121,7 +121,7 @@ class CUDADeviceContextAllocator : public Allocator {
return allocation;
}
void FreeImpl(Allocation *allocation) override { delete allocation; }
void FreeImpl(pten::Allocation *allocation) override { delete allocation; }
private:
platform::CUDAPlace place_;
......
......@@ -101,7 +101,7 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator(
bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; }
void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) {
void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_,
platform::errors::PermissionDenied(
......@@ -140,7 +140,7 @@ void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) {
delete allocation;
}
Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) {
pten::Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) {
size = AlignedSize(size, granularity_);
CUdeviceptr ptr = virtual_mem_base_ + virtual_mem_alloced_offset_;
......
......@@ -37,8 +37,8 @@ class CUDAVirtualMemAllocator : public Allocator {
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation* allocation) override;
pten::Allocation* AllocateImpl(size_t size) override;
private:
platform::CUDAPlace place_;
......
......@@ -37,12 +37,12 @@ LockedAllocator::LockedAllocator(
}
}
void LockedAllocator::FreeImpl(Allocation *allocation) {
void LockedAllocator::FreeImpl(pten::Allocation *allocation) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
underlying_allocator_->Free(allocation);
}
Allocation *LockedAllocator::AllocateImpl(size_t size) {
pten::Allocation *LockedAllocator::AllocateImpl(size_t size) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
return underlying_allocator_->Allocate(size).release();
}
......
......@@ -29,8 +29,8 @@ class LockedAllocator : public Allocator {
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation *allocation) override;
pten::Allocation *AllocateImpl(size_t size) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
......
......@@ -790,7 +790,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
namespace allocation {
Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
......@@ -798,7 +798,7 @@ Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
return tmp_alloc;
}
void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) {
void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) {
boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place());
......
......@@ -34,8 +34,8 @@ class NaiveBestFitAllocator : public Allocator {
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
pten::Allocation *AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation *allocation) override;
uint64_t ReleaseImpl(const platform::Place &place) override;
private:
......
......@@ -22,7 +22,7 @@ namespace memory {
namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(Allocation* allocation) {
void NPUAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_,
platform::errors::PermissionDenied(
......@@ -32,7 +32,7 @@ void NPUAllocator::FreeImpl(Allocation* allocation) {
delete allocation;
}
Allocation* NPUAllocator::AllocateImpl(size_t size) {
pten::Allocation* NPUAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_,
[this] { platform::SetNPUDeviceId(place_.device); });
......
......@@ -28,8 +28,8 @@ class NPUAllocator : public Allocator {
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation* allocation) override;
pten::Allocation* AllocateImpl(size_t size) override;
private:
platform::NPUPlace place_;
......
......@@ -26,7 +26,7 @@ void NPUPinnedAllocator::ProcessEventsAndFree() {
platform::NPUEventQuery(event, &status);
if (status == ACL_EVENT_STATUS_COMPLETE) {
Allocation *allocation = it->first;
auto *allocation = it->first;
void *ptr = allocation->ptr();
free(ptr);
npu_events_.erase(it++);
......@@ -38,7 +38,7 @@ void NPUPinnedAllocator::ProcessEventsAndFree() {
}
}
Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
pten::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
std::lock_guard<std::mutex> lock(mtx_);
ProcessEventsAndFree();
void *ptr;
......@@ -50,7 +50,7 @@ Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
return new Allocation(ptr, size, platform::NPUPinnedPlace());
}
void NPUPinnedAllocator::FreeImpl(Allocation *allocation) {
void NPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) {
std::lock_guard<std::mutex> lock(mtx_);
void *ptr = allocation->ptr();
auto iter = npu_events_.find(allocation);
......@@ -83,7 +83,7 @@ uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
return static_cast<uint64_t>(0);
}
void NPUPinnedAllocator::RecordEvent(Allocation *allocation,
void NPUPinnedAllocator::RecordEvent(pten::Allocation *allocation,
aclrtStream stream) {
std::lock_guard<std::mutex> lock(mtx_);
aclrtEvent event = nullptr;
......
......@@ -32,16 +32,16 @@ class NPUPinnedAllocator : public Allocator {
public:
bool IsAllocThreadSafe() const override { return true; }
void ProcessEventsAndFree();
void RecordEvent(Allocation *allocation, aclrtStream stream);
void RecordEvent(pten::Allocation *allocation, aclrtStream stream);
constexpr static size_t kAlignment = 4096UL;
protected:
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
pten::Allocation *AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation *allocation) override;
uint64_t ReleaseImpl(const platform::Place &place) override;
private:
std::unordered_map<Allocation *, aclrtEvent> npu_events_;
std::unordered_map<pten::Allocation *, aclrtEvent> npu_events_;
mutable std::mutex mtx_;
};
......
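NPUPinnedAllocator cannot free pinned memory while an NPU stream may still read it, so FreeImpl only queues the allocation and ProcessEventsAndFree releases it once the recorded event completes. A host-only sketch of that bookkeeping, with completion faked by a countdown instead of an aclrtEvent query:

#include <cstdlib>
#include <iostream>
#include <unordered_map>

struct Allocation { void* ptr; };
struct Event { int remaining_queries; };  // "complete" once it reaches 0

std::unordered_map<Allocation*, Event> npu_events;

void RecordEvent(Allocation* a, int delay) { npu_events[a] = Event{delay}; }

void ProcessEventsAndFree() {
  for (auto it = npu_events.begin(); it != npu_events.end();) {
    if (--it->second.remaining_queries <= 0) {  // event completed
      std::free(it->first->ptr);
      delete it->first;
      it = npu_events.erase(it);
    } else {
      ++it;
    }
  }
}

int main() {
  auto* a = new Allocation{std::malloc(64)};
  RecordEvent(a, /*delay=*/2);  // async copy still references a
  ProcessEventsAndFree();       // not complete yet: memory kept alive
  ProcessEventsAndFree();       // complete now: actually freed
  std::cout << "pending events: " << npu_events.size() << "\n";  // 0
}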
......@@ -18,7 +18,7 @@ namespace paddle {
namespace memory {
namespace allocation {
bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
void CPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr()));
#else
......@@ -26,7 +26,7 @@ void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
#endif
delete allocation;
}
Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
pten::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
void *ptr;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable));
......
......@@ -25,8 +25,8 @@ class CPUPinnedAllocator : public Allocator {
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation *allocation) override;
pten::Allocation *AllocateImpl(size_t size) override;
};
} // namespace allocation
......
......@@ -39,7 +39,7 @@ class WaitedAllocateSizeGuard {
size_t requested_size_;
};
void RetryAllocator::FreeImpl(Allocation* allocation) {
void RetryAllocator::FreeImpl(pten::Allocation* allocation) {
// Delete underlying allocation first.
size_t size = allocation->size();
underlying_allocator_->Free(allocation);
......@@ -51,7 +51,7 @@ void RetryAllocator::FreeImpl(Allocation* allocation) {
}
}
Allocation* RetryAllocator::AllocateImpl(size_t size) {
pten::Allocation* RetryAllocator::AllocateImpl(size_t size) {
auto alloc_func = [&, this]() {
return underlying_allocator_->Allocate(size).release();
};
......
......@@ -45,8 +45,8 @@ class RetryAllocator : public Allocator {
bool IsAllocThreadSafe() const override { return true; }
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation* allocation) override;
pten::Allocation* AllocateImpl(size_t size) override;
uint64_t ReleaseImpl(const platform::Place& place) override {
return underlying_allocator_->Release(place);
}
......
......@@ -98,12 +98,12 @@ class DummyAllocator : public Allocator {
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation *AllocateImpl(size_t size) override {
pten::Allocation *AllocateImpl(size_t size) override {
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"Here is a test exception, always BadAlloc."));
}
void FreeImpl(Allocation *) override {}
void FreeImpl(pten::Allocation *) override {}
};
TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
......
......@@ -19,7 +19,7 @@ namespace memory {
namespace allocation {
StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
AllocationPtr underlying_allocation, gpuStream_t owning_stream)
DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream)
: Allocation(underlying_allocation->ptr(),
underlying_allocation->base_ptr(),
underlying_allocation->size(), underlying_allocation->place()),
......@@ -116,7 +116,7 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
pten::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
ProcessUnfreedAllocations();
VLOG(8) << "Try allocate " << size << " bytes";
AllocationPtr underlying_allocation;
......@@ -136,13 +136,14 @@ Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
throw;
}
StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation(
std::move(underlying_allocation), default_stream_);
static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
default_stream_);
VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
<< allocation->ptr();
return allocation;
}
void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) {
void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) {
StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
......
......@@ -34,7 +34,7 @@ namespace allocation {
class StreamSafeCUDAAllocation : public Allocation {
public:
StreamSafeCUDAAllocation(AllocationPtr underlying_allocation,
StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation,
gpuStream_t owning_stream);
void RecordStream(const gpuStream_t &stream);
bool CanBeFreed();
......@@ -42,7 +42,7 @@ class StreamSafeCUDAAllocation : public Allocation {
const gpuStream_t &GetOwningStream() const;
private:
AllocationPtr underlying_allocation_;
DecoratedAllocationPtr underlying_allocation_;
std::map<gpuStream_t, gpuEvent_t> outstanding_event_map_;
gpuStream_t owning_stream_;
SpinLock outstanding_event_map_lock_;
......@@ -57,8 +57,8 @@ class StreamSafeCUDAAllocator : public Allocator {
bool IsAllocThreadSafe() const override;
protected:
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
pten::Allocation *AllocateImpl(size_t size) override;
void FreeImpl(pten::Allocation *allocation) override;
uint64_t ReleaseImpl(const platform::Place &place) override;
private:
......
......@@ -32,12 +32,12 @@ struct StubAllocator : public Allocator {
size_t AllocNum() const { return alloc_num_; }
protected:
Allocation *AllocateImpl(size_t size) override {
pten::Allocation *AllocateImpl(size_t size) override {
++alloc_num_;
return new Allocation(new uint8_t[size], size, platform::CPUPlace());
}
void FreeImpl(Allocation *allocation) override {
void FreeImpl(pten::Allocation *allocation) override {
delete[] static_cast<uint8_t *>(allocation->ptr());
delete allocation;
--alloc_num_;
......
......@@ -83,11 +83,11 @@ class ThreadLocalCUDAAllocator : public Allocator {
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation* AllocateImpl(size_t size) override {
pten::Allocation* AllocateImpl(size_t size) override {
return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl(
size);
}
void FreeImpl(Allocation* allocation) override {
void FreeImpl(pten::Allocation* allocation) override {
auto* tl_allocation = static_cast<ThreadLocalAllocation*>(allocation);
auto allocator_impl = tl_allocation->GetAllocator();
allocator_impl->FreeImpl(tl_allocation);
......
......@@ -35,7 +35,8 @@ VirtualMemoryAutoGrowthBestFitAllocator::
alignment_(alignment),
place_(place) {}
Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl(
size_t size) {
std::lock_guard<SpinLock> guard(spinlock_);
size = AlignedSize(size, alignment_);
auto result = AllocFromFreeBlocks(size);
......@@ -48,7 +49,8 @@ Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
return result;
}
void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl(
pten::Allocation *allocation) {
std::lock_guard<SpinLock> guard(spinlock_);
auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
TryMergeBlock2Blocks(block_it);
......@@ -225,7 +227,7 @@ void VirtualMemoryAutoGrowthBestFitAllocator::ExtendAndMerge(size_t size) {
}
}
Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks(
pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks(
size_t size) {
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
if (iter != free_blocks_.end()) {
......
......@@ -60,12 +60,12 @@ class VirtualMemoryAutoGrowthBestFitAllocator : public Allocator {
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation *AllocateImpl(size_t size) override;
pten::Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
void FreeImpl(pten::Allocation *allocation) override;
private:
Allocation *AllocFromFreeBlocks(size_t size);
pten::Allocation *AllocFromFreeBlocks(size_t size);
void ExtendAndMerge(size_t size);
void TryMergeBlock2Blocks(std::list<Block>::iterator iter);
......
......@@ -28,7 +28,7 @@ class DeviceContext;
namespace memory {
using allocation::Allocation;
using pten::Allocation;
using allocation::Allocator;
using allocation::AllocationPtr;
......
......@@ -336,9 +336,8 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
auto* data_alloc_released = data_alloc.release();
auto* col_alloc_released = col_alloc.release();
context.AddStreamCallback([data_alloc_released, col_alloc_released] {
memory::allocation::AllocationDeleter deleter;
deleter(data_alloc_released);
deleter(col_alloc_released);
memory::allocation::Allocator::AllocationDeleter(data_alloc_released);
memory::allocation::Allocator::AllocationDeleter(col_alloc_released);
});
#endif
}
......@@ -466,9 +465,8 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
auto* data_alloc_released = data_alloc.release();
auto* cols_alloc_released = cols_alloc.release();
context.AddStreamCallback([data_alloc_released, cols_alloc_released] {
memory::allocation::AllocationDeleter deleter;
deleter(data_alloc_released);
deleter(cols_alloc_released);
memory::allocation::Allocator::AllocationDeleter(data_alloc_released);
memory::allocation::Allocator::AllocationDeleter(cols_alloc_released);
});
#endif
}
......
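Both functors release their smart pointers before kicking off device work and reclaim the raw pointers inside the stream callback through the now-static AllocationDeleter. A simplified, synchronous sketch of that lifetime hand-off, where the stream is just a callback queue:

#include <functional>
#include <memory>
#include <vector>

struct Allocation { int* data; };  // stand-in for memory::allocation::Allocation

static void AllocationDeleter(Allocation* a) {
  delete[] a->data;
  delete a;
}

std::vector<std::function<void()>> pending;  // stands in for the CUDA stream
void AddStreamCallback(std::function<void()> cb) { pending.push_back(std::move(cb)); }
void DrainStream() {
  for (auto& cb : pending) cb();
  pending.clear();
}

int main() {
  auto alloc = std::unique_ptr<Allocation, void (*)(Allocation*)>(
      new Allocation{new int[8]}, AllocationDeleter);
  auto* released = alloc.release();  // keep the buffer alive past this scope
  AddStreamCallback([released] { AllocationDeleter(released); });
  DrainStream();  // the "stream" completes; memory reclaimed in the callback
}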
......@@ -55,7 +55,7 @@ class MLUDeviceContextAllocation : public Allocation {
<< p_allocation;
dev_ctx_->AddStreamCallback([p_allocation] {
VLOG(4) << "Delete MLUDeviceContextAllocation at " << p_allocation;
AllocationDeleter()(p_allocation);
Allocator::AllocationDeleter(p_allocation);
});
}
......@@ -91,7 +91,7 @@ class MLUDeviceContextAllocator : public Allocator {
}
protected:
Allocation *AllocateImpl(size_t size) override {
pten::Allocation *AllocateImpl(size_t size) override {
PADDLE_ENFORCE_NOT_NULL(
default_stream_,
platform::errors::PreconditionNotMet(
......@@ -105,7 +105,7 @@ class MLUDeviceContextAllocator : public Allocator {
return allocation;
}
void FreeImpl(Allocation *allocation) override { delete allocation; }
void FreeImpl(pten::Allocation *allocation) override { delete allocation; }
private:
platform::MLUPlace place_;
......
......@@ -158,8 +158,7 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
paddle::memory::allocation::Allocation *allocation =
npu_pinned_tensor.Holder().get();
pten::Allocation *allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream());
} else {
......
......@@ -53,7 +53,7 @@ size_t PyArray_Size_(PyObject* numpy_data) {
return res;
}
class EagerNumpyAllocation : public paddle::memory::allocation::Allocation {
class EagerNumpyAllocation : public pten::Allocation {
public:
explicit EagerNumpyAllocation(PyObject* numpy_data, pten::DataType dtype)
: Allocation(
......
cc_library(pten_api_utils SRCS allocator.cc storage.cc tensor_utils.cc DEPS
cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS
tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/api/lib/utils/allocator.h"
namespace paddle {
namespace experimental {
memory::Allocator::AllocationDeleter DefaultAllocator::deleter_;
} // namespace experimental
} // namespace paddle
......@@ -22,14 +22,15 @@ limitations under the License. */
namespace paddle {
namespace experimental {
class DefaultAllocator : public pten::Allocator {
class DefaultAllocator : public pten::deprecated::Allocator {
public:
using Allocation = pten::Allocation;
using Allocation = pten::deprecated::Allocation;
explicit DefaultAllocator(const paddle::platform::Place& place)
: place_(place) {}
static void Delete(Allocation* allocation) {
deleter_(allocation->CastContextWithoutCheck<paddle::memory::Allocation>());
paddle::memory::allocation::Allocator::AllocationDeleter(
allocation->CastContextWithoutCheck<paddle::memory::Allocation>());
}
Allocation Allocate(size_t bytes_size) override {
......@@ -42,7 +43,6 @@ class DefaultAllocator : public pten::Allocator {
private:
paddle::platform::Place place_;
static paddle::memory::Allocator::AllocationDeleter deleter_;
};
} // namespace experimental
......
......@@ -20,14 +20,13 @@ namespace experimental {
ExternalStorage::ExternalStorage(void* ptr,
size_t size,
const paddle::platform::Place& place)
: pten::Storage(
std::make_shared<paddle::memory::Allocation>(ptr, size, place)),
: pten::Storage(std::make_shared<pten::Allocation>(ptr, size, place)),
size_(size) {}
ExternalStorage::ExternalStorage(const pten::intrusive_ptr<pten::Storage>& root,
size_t delta,
size_t size)
: Storage(std::make_shared<paddle::memory::Allocation>(
: Storage(std::make_shared<pten::Allocation>(
static_cast<uint8_t*>(root->data()) + delta, size, root->place())),
size_(size) {
PADDLE_ENFORCE_LE(static_cast<size_t>(delta + size),
......
......@@ -307,7 +307,7 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) {
dst->Resize(src->dims());
dst->set_type(pten::TransToProtoVarType(src->dtype()));
auto storage = src->release();
std::shared_ptr<paddle::memory::allocation::Allocation> holder(
std::shared_ptr<pten::Allocation> holder(
new TensorStorage(std::move(storage)));
dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype()));
dst->set_offset(src->meta().offset);
......
......@@ -16,8 +16,10 @@ limitations under the License. */
#include <cstdint>
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/core/candidate/allocator.h"
namespace pten {
namespace deprecated {
/// \brief Encapsulates strategies for access/addressing, allocation/
/// deallocation and construction/destruction of objects.
......@@ -147,4 +149,5 @@ inline Allocation Allocate(const std::shared_ptr<Allocator>& a, size_t n) {
return a->Allocate(n);
}
} // namespace deprecated
} // namespace pten
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
#include <functional>
#include "paddle/fluid/platform/place.h"
namespace pten {
/// \brief Fancy pointer with deleter. The use of this data type
/// is to be compatible with allocators from different frameworks
/// without significant performance loss. This class does not
/// support being inherited.
class Allocation {
public:
using Place = paddle::platform::Place;
using DeleterFnPtr = void (*)(Allocation*);
Allocation() = default;
// Don't own resources, only provide access.
Allocation(void* data, size_t size, const Place& place)
: ptr_(data), size_(size), place_(place) {}
// Own resources.
Allocation(void* data, size_t size, DeleterFnPtr deleter, const Place& place)
: ptr_(data), size_(size), deleter_(deleter), place_(place) {}
Allocation(Allocation&& other) noexcept { swap(*this, other); }
Allocation& operator=(Allocation&& other) noexcept {
// Exchange them explicitly so that a move is not
// equivalent to a copy.
swap(*this, other);
return *this;
}
virtual ~Allocation() {
if (deleter_) {
deleter_(this);
}
}
// Returns the holding pointer.
// NOTE: For performance considerations, it is better not to make this method
// a virtual method. If we want to implement `defragmentation` later,
// we might need to make the `ptr_` field a protected field, and add a virtual
// method like `defragmentation` to change `ptr_`.
void* ptr() const noexcept { return ptr_; }
// Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
// last valid element.
//
// NOTE: Some allocators might allocate more memory than requested. The size
// could be larger than the request. For example,
// the AlignedAllocator will always allocate memory as size + kAlignment.
// The raw pointer might not be aligned, so an offset might be added to the raw
// pointer. The size of this allocation will be
// `size + kAlignment - offset`.
size_t size() const noexcept { return size_; }
void* operator->() const noexcept { return ptr_; }
operator bool() const noexcept { return ptr_; }
const Place& place() const noexcept { return place_; }
DeleterFnPtr deleter() const noexcept { return deleter_; }
protected:
friend void swap(Allocation& a, Allocation& b) noexcept;
void* ptr_{nullptr};
size_t size_{};
DeleterFnPtr deleter_{nullptr};
// TODO(Shixiaowei02): Enum needs to be used instead to reduce
// the construction overhead by more than 50%.
Place place_;
};
inline void swap(Allocation& a, Allocation& b) noexcept {
::std::swap(a.ptr_, b.ptr_);
::std::swap(a.deleter_, b.deleter_);
::std::swap(a.place_, b.place_);
::std::swap(a.size_, b.size_);
}
class Allocator {
public:
using DeleterType = std::function<void(Allocation*)>;
using AllocationPtr = std::unique_ptr<Allocation, DeleterType>;
virtual ~Allocator() = default;
virtual AllocationPtr Allocate(size_t bytes_size) = 0;
virtual bool IsAllocThreadSafe() const { return false; }
};
} // namespace pten
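The pten::Allocation above is a fancy pointer: whether it owns its buffer is expressed solely by whether a DeleterFnPtr was supplied. A trimmed, self-contained sketch of the two constructors, with Place standing in for paddle::platform::Place:

#include <cstdlib>
#include <iostream>

struct Place {};  // stand-in for paddle::platform::Place

class Allocation {
 public:
  using DeleterFnPtr = void (*)(Allocation*);
  // Don't own resources, only provide access.
  Allocation(void* data, size_t size, const Place& place)
      : ptr_(data), size_(size), place_(place) {}
  // Own resources: the deleter runs exactly once, on destruction.
  Allocation(void* data, size_t size, DeleterFnPtr deleter, const Place& place)
      : ptr_(data), size_(size), deleter_(deleter), place_(place) {}
  virtual ~Allocation() {
    if (deleter_) deleter_(this);
  }
  void* ptr() const noexcept { return ptr_; }
  size_t size() const noexcept { return size_; }

 protected:
  void* ptr_{nullptr};
  size_t size_{};
  DeleterFnPtr deleter_{nullptr};
  Place place_;
};

int main() {
  void* buf = std::malloc(32);
  {
    Allocation view(buf, 32, Place{});  // non-owning: destructor is a no-op
    std::cout << "viewing " << view.size() << " bytes\n";
  }
  Allocation owner(
      buf, 32, [](Allocation* self) { std::free(self->ptr()); }, Place{});
  std::cout << "owning " << owner.size() << " bytes\n";
}  // buf freed here by the owner's deleter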
......@@ -60,6 +60,8 @@ class TensorInplaceVersion {
class DenseTensor : public TensorBase,
public TypeInfoTraits<TensorBase, DenseTensor> {
public:
using Allocator = deprecated::Allocator;
/// \brief Construct a dense tensor and allocate space.
/// \param a The allocator used to allocate space.
/// \param meta The meta data of dense tensor.
......
......@@ -91,6 +91,7 @@ class Storage : public intrusive_ref_counter<Storage> {
class TensorStorage : public Storage {
public:
using Place = paddle::platform::Place;
using Allocator = deprecated::Allocator;
explicit TensorStorage(const std::shared_ptr<Allocator>& a) : alloc_(a) {}
......
......@@ -21,7 +21,7 @@ limitations under the License. */
namespace pten {
namespace tests {
class HostAllocatorSample : public pten::RawAllocator {
class HostAllocatorSample : public pten::deprecated::RawAllocator {
public:
using Place = paddle::platform::Place;
void* Allocate(size_t bytes_size) override {
......@@ -36,8 +36,9 @@ class HostAllocatorSample : public pten::RawAllocator {
Place place_{paddle::platform::CPUPlace()};
};
class FancyAllocator : public pten::Allocator {
class FancyAllocator : public pten::deprecated::Allocator {
public:
using Allocation = pten::deprecated::Allocation;
static void Delete(Allocation* allocation) {
::operator delete(allocation->ptr());
}
......@@ -55,7 +56,7 @@ class FancyAllocator : public pten::Allocator {
template <typename T>
struct CustomAllocator {
using value_type = T;
using Allocator = pten::RawAllocator;
using Allocator = pten::deprecated::RawAllocator;
explicit CustomAllocator(const std::shared_ptr<Allocator>& a) noexcept
: alloc_(a) {}
......
......@@ -24,6 +24,10 @@ limitations under the License. */
namespace pten {
namespace tests {
using RawAllocator = pten::deprecated::RawAllocator;
using Allocator = pten::deprecated::Allocator;
using Allocation = pten::deprecated::Allocation;
template <typename T>
bool host_allocator_test(size_t vector_size) {
std::vector<T> src(vector_size);
......
......@@ -226,7 +226,7 @@ if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
HAS_MODIFIED_ALLOCATION=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/memory/allocation" || true`
if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
echo_line="You must be approved by zhiqiu and Shixiaowei02 for paddle/fluid/memory/allocation.\nIt is being modularized and refactored. Thanks!\n"
check_approval 2 6888866 39303645
check_approval 1 6888866 39303645
fi
HAS_MODIFIED_TENSOR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/framework/tensor" || true`
......@@ -241,23 +241,6 @@ if [ "${HAS_MODIFIED_TENSOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
check_approval 1 22561442 22334008
fi
ALLOCSHARED_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "*\.(h|cc)" || true`
if [ "${ALLOCSHARED_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
ERROR_LINES=""
for TEST_FILE in ${ALLOCSHARED_FILE_CHANGED};
do
HAS_SKIP_CHECK_ALLOC_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "AllocShared" || true`
if [ "${HAS_SKIP_CHECK_ALLOC_CI}" != "" ]; then
ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${HAS_SKIP_CHECK_ALLOC_CI}\n"
fi
done
if [ "${ERROR_LINES}" != "" ]; then
ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
echo_line="memory::AllocShared is not recommended, because it is being modularized and refactored. Please use memory::Alloc here. Otherwise, please request zhiqiu and Shixiaowei02 review and approve.\n"
check_approval 2 6888866 39303645
fi
fi
ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"
......