diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 6d8ba430bd0b9c9e48b4a80a07feb24b2da7d7b8..a02e53dcf764368601646a900833ac650c5bb31a 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -365,6 +365,9 @@ class ExecutionContext {
     auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
         allocation_ptr, deleter);
 
+    PADDLE_ENFORCE(
+        dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
+        "The AllocationPtr must be TemporaryAllocation.");
     PADDLE_ENFORCE_GE(allocation_ptr->size(),
                       framework::product(dim) * sizeof(T));
 
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 0f6014ae8aa28f090cb51401ee2cb0772bca7a45..ac77c3d2a500816a4eb41ed13f23ee628290f287 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
-cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 
 if (WITH_GPU)
@@ -38,20 +37,30 @@ else ()
   set(AllocatorFacadeDeps)
 endif()
 
-list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator)
-
 cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
 cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
+cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
-cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
-cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
+cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
+cc_library(allocator_facade SRCS allocator_facade.cc DEPS
+  ${AllocatorFacadeDeps}
+  cpu_allocator
+  locked_allocator
+  best_fit_allocator
+  aligned_allocator
+  auto_increment_allocator
+  zero_size_allocator
+  conditional_allocator
+  retry_allocator
+  buffered_allocator
+  allocator_strategy
+  legacy_allocator
+  )
 
 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
 
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
 
-cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
-
 cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
 
 cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
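The `PADDLE_ENFORCE` added in operator.h runs just before the raw pointer is handed to a `std::shared_ptr` with a custom deleter, so a mismatched dynamic type fails loudly up front instead of corrupting memory in the deleter later. A minimal standalone sketch of that guard pattern (not Paddle code; `Base`/`Temporary` are stand-in types):

```cpp
#include <cassert>
#include <memory>

struct Base { virtual ~Base() = default; };
struct Temporary : Base {};

void TakeOwnership(Base* raw) {
  // Verify the dynamic type before transferring ownership, mirroring the
  // dynamic_cast check the diff adds in ExecutionContext.
  assert(dynamic_cast<Temporary*>(raw) != nullptr &&
         "pointer must be a Temporary");
  std::shared_ptr<Base> shared(raw, [](Base* p) { delete p; });
}

int main() { TakeOwnership(new Temporary); }
```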
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index b536d4276e3b6236d0748eee588d345dd15c6954..064acd06e71da98802126913e0af843cfbf717e7 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -94,8 +94,6 @@ class AlignedAllocator : public ThinAlignedAllocator {
         underlying_allocator_->Allocate(size + kAlignment, attr);
     return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size);
   }
-
-  void FreeImpl(Allocation* allocation) override { delete allocation; }
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc
index 5a5253d911abc722c026730e7e88eb326bb82afd..8fb8a5fb897a736d7515951ba08c633da9a7706c 100644
--- a/paddle/fluid/memory/allocation/allocator.cc
+++ b/paddle/fluid/memory/allocation/allocator.cc
@@ -27,24 +27,16 @@ bool Allocator::IsAllocThreadSafe() const { return false; }
 
 AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
   auto ptr = AllocateImpl(size, attr);
-  ptr->RegisterDecoratedAllocator(this);
+  ptr->set_allocator(this);
   return AllocationPtr(ptr);
 }
 
-void Allocator::FreeImpl(Allocation* allocation) {
-  Allocator* allocator = allocation->TopDecoratedAllocator();
-  allocator->Free(allocation);
-}
-
-void Allocator::Free(Allocation* allocation) {
-  allocation->PopDecoratedAllocator();
-  FreeImpl(allocation);
-}
+void Allocator::Free(Allocation* allocation) { delete allocation; }
 
 const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
 
 void AllocationDeleter::operator()(Allocation* allocation) const {
-  Allocator* allocator = allocation->TopDecoratedAllocator();
+  auto* allocator = allocation->allocator();
   allocator->Free(allocation);
 }
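This hunk is the heart of the revert: `Free()` no longer pops a recorded decorator chain; the deleter simply asks the allocation for the single allocator that produced it. A standalone sketch of the restored back-pointer scheme (simplified; the real classes also carry place, size, and `BadAlloc`):

```cpp
#include <memory>

class Allocation;

class Allocator {
 public:
  virtual ~Allocator() = default;
  virtual void Free(Allocation* a);  // default behavior: plain delete
};

class Allocation {
 public:
  Allocator* allocator() const { return allocator_; }
  void set_allocator(Allocator* a) { allocator_ = a; }
  virtual ~Allocation() = default;

 private:
  Allocator* allocator_{nullptr};
};

void Allocator::Free(Allocation* a) { delete a; }

struct AllocationDeleter {
  void operator()(Allocation* a) const { a->allocator()->Free(a); }
};

using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;

int main() {
  Allocator allocator;
  AllocationPtr p(new Allocation);
  p->set_allocator(&allocator);  // Allocate() does this in the diff
}  // AllocationDeleter -> allocator.Free -> delete
```

The design trade-off: one pointer per allocation is cheap and simple, but every wrapping allocator must now wrap the allocation object itself (see `AllocationWithUnderlying` in later hunks) rather than relying on a recorded chain.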
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 33b816b90812d7fedc450a67743b5d7d20579302..3465278935f7ce05456e94bb3a7d1ae9f114ff96 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -46,56 +46,13 @@ class Allocator;
 // NOTE: this is the base class of Allocation. Each allocator can use its own
 //       allocation object.
 // NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0
-
-/**
- * Allocation is returned by Allocator::Allocate() method.
- *
- * An allocator may be decorated by another allocator. For example, we can
- * decorate a RetryAllocator to any allocator to perform allocation retry
- * when the first allocation request fails.
- *
- * Explanations of the Allocator design are as follows:
- *
- * Suppose we have an allocator which is decorated by several allocators:
- *
- *   A(1) <- A(2) <- A(3) <- ... <- A(n)
- *
- * , and the public allocator is A(1).
- *
- * The allocation process would be:
- *
- *   A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate()
- *
- * , and the free process would be:
- *
- *   A(1).Free() -> A(2).Free() -> ... -> A(n).Free()
- *
- * Therefore, we should record the allocator chain when allocating, so
- * that we can free the allocation in the reverse order of the allocator chain.
- * The field `decorated_allocators_` is used to record this chain.
- *
- * Another example is that we want to add additional fields to an Allocation,
- * e.g., something like what is done in AlignedAllocator. In this case, we
- * should declare a derived class of Allocation, which contains an underlying
- * Allocation allocated by the underlying allocator. Therefore, the
- * `decorated_allocators_` of the new Allocation object would be a new chain,
- * differing from the underlying Allocation object.
- */
 class Allocation {
  public:
   Allocation(void* ptr, size_t size, platform::Place place)
-      : ptr_(ptr), size_(size), place_(place) {
-    // NOTE(zjl): Since decorated_allocators_ is usually a small vector,
-    // we reserve a small buffer for it to prevent frequent heap allocation.
-    // Not quite sure whether we need something like a gtl vector.
-    decorated_allocators_.reserve(8);
-  }
+      : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}
 
   Allocation(const Allocation& o) = delete;
   Allocation& operator=(const Allocation& o) = delete;
-  Allocation(Allocation&& o) = delete;
-  Allocation& operator=(Allocation&& o) = delete;
 
   // Returns the holding pointer.
   // NOTE: For performance consideration, it is better not to make this method
@@ -117,31 +74,17 @@ class Allocation {
 
   const platform::Place& place() const { return place_; }
 
-  virtual ~Allocation();
-
- private:
-  const std::vector<Allocator*>& DecoratedAllocators() const {
-    return decorated_allocators_;
-  }
-
-  inline void RegisterDecoratedAllocator(Allocator* allocator) {
-    decorated_allocators_.push_back(allocator);
-  }
+  Allocator* allocator() { return allocator_; }
 
-  inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }
+  void set_allocator(Allocator* allocator) { allocator_ = allocator; }
 
-  inline Allocator* TopDecoratedAllocator() {
-    return decorated_allocators_.back();
-  }
+  virtual ~Allocation();
 
  private:
+  Allocator* allocator_;
   void* ptr_;
   size_t size_;
   platform::Place place_;
-  std::vector<Allocator*> decorated_allocators_;
-
-  friend class Allocator;
-  friend class AllocationDeleter;
 };
 
 using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
@@ -191,12 +134,9 @@ class Allocator {
   // True if the `Allocate` is thread safe.
   virtual bool IsAllocThreadSafe() const;
 
-  // This function should not be called outside
-  void Free(Allocation* allocation);
-
 protected:
+  virtual void Free(Allocation* allocation);
   virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;
-  virtual void FreeImpl(Allocation* allocation);
 
 private:
  friend class AllocationDeleter;
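For contrast, the comment block deleted above described the decorator bookkeeping this revert removes. A standalone model of that removed scheme (names are stand-ins), showing why frees had to unwind in reverse registration order:

```cpp
#include <iostream>
#include <string>
#include <vector>

class Allocation;

class Allocator {
 public:
  explicit Allocator(std::string name, Allocator* underlying = nullptr)
      : name_(std::move(name)), underlying_(underlying) {}

  Allocation* Allocate();
  void Free(Allocation* a);

 private:
  std::string name_;
  Allocator* underlying_;
};

class Allocation {
 public:
  std::vector<Allocator*> decorators;  // innermost first, outermost last
};

Allocation* Allocator::Allocate() {
  // Each layer allocates through its underlying allocator, then registers
  // itself -- the removed RegisterDecoratedAllocator().
  Allocation* a = underlying_ ? underlying_->Allocate() : new Allocation;
  a->decorators.push_back(this);
  return a;
}

void Allocator::Free(Allocation* a) {
  std::cout << name_ << ".Free\n";
  a->decorators.pop_back();  // the removed PopDecoratedAllocator()
  if (underlying_) {
    underlying_->Free(a);
  } else {
    delete a;
  }
}

int main() {
  Allocator raw("raw");
  Allocator locked("locked", &raw);
  Allocator retry("retry", &locked);
  Allocation* a = retry.Allocate();
  a->decorators.back()->Free(a);  // retry.Free -> locked.Free -> raw.Free
}
```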
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 09328aded58cb0cccd9de0aba399f5c49313042f..a3b73e3ba31c89c2a94955b0fea64df4ab0ffc26 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -49,17 +49,6 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-static inline std::shared_ptr<Allocator> WrapRetryAllocator(
-    std::shared_ptr<Allocator> allocator, int64_t retry_time) {
-  if (retry_time > 0) {
-    auto* retry_allocator =
-        new RetryAllocator(std::move(allocator), retry_time);
-    allocator.reset(retry_allocator);
-  }
-
-  return allocator;
-}
-
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
 class CPUManagedAllocator : public Allocator {
  public:
@@ -123,10 +112,14 @@ class ChunkedAllocator : public Allocator {
   std::shared_ptr<Allocator> CreateAllocatorWithChunk() {
     chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
     auto* allocation = chunks_.back().get();
-    std::shared_ptr<Allocator> allocator(new LockedAllocator(
-        std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));
+    std::unique_ptr<Allocator> allocator(new LockedAllocator(
+        std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
 
-    allocator = WrapRetryAllocator(allocator, retry_time_);
+    if (retry_time_ > 0) {
+      auto* retry_allocator =
+          new RetryAllocator(std::move(allocator), retry_time_);
+      allocator.reset(retry_allocator);
+    }
 
     return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
   }
@@ -197,23 +190,13 @@ class AllocatorFacadePrivate {
   ~AllocatorFacadePrivate() = default;
 
   AllocatorFacadePrivate() {
-    auto strategy = GetAllocatorStrategy();
-    switch (strategy) {
-      case AllocatorStrategy::kLegacy: {
-        InitLegacyAllocator();
-        break;
-      }
-      case AllocatorStrategy::kNaiveBestFit: {
-        InitCPUAllocator();
-        InitCUDAAllocator();
-        InitCUDAPinnedAllocator();
-        WrapZeroSizeAllocator();
-        break;
-      }
-      default: {
-        PADDLE_THROW("Unsupported allocator strategy: %d",
-                     static_cast<int>(strategy));
-      }
+    if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) {
+      InitLegacyAllocator();
+    } else {
+      InitCPUAllocator();
+      InitCUDAAllocator();
+      InitCUDAPinnedAllocator();
+      WrapZeroSizeAllocator();
     }
   }
@@ -271,7 +254,8 @@ AllocatorFacade& AllocatorFacade::Instance() {
 
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, Allocator::Attr attr) {
-  return std::shared_ptr<Allocation>(Alloc(place, size, attr));
+  return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
+                                     AllocationDeleter());
 }
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index fff94c01e709613603eea7150a08df3c2611dec2..8cebda9005b29b5b3259de0830c42eb10ef90e66 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -19,22 +19,16 @@
 DEFINE_string(
     allocator_strategy, "legacy",
     "The allocation strategy. Legacy means the original allocator of Fluid."
-    "naive_best_fit means the experimental best fit allocator. "
-    "allocator. Enum in [legacy, naive_best_fit].");
+    "New means the experimental allocators of Fluid. in [legacy, new]");
 
 namespace paddle {
 namespace memory {
 namespace allocation {
 
 static AllocatorStrategy GetStrategyFromFlag() {
-  if (FLAGS_allocator_strategy == "legacy") {
-    return AllocatorStrategy::kLegacy;
-  } else if (FLAGS_allocator_strategy == "naive_best_fit") {
-    return AllocatorStrategy::kNaiveBestFit;
-  } else {
-    PADDLE_THROW("Unsupported allocator strategy: %s",
-                 FLAGS_allocator_strategy);
-  }
+  return FLAGS_allocator_strategy == "legacy"
+             ? AllocatorStrategy::kLegacy
+             : AllocatorStrategy::kNaiveBestFit;
 }
 
 AllocatorStrategy GetAllocatorStrategy() {
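`CreateAllocatorWithChunk()` restores the explicit wrapping order: a `BestFitAllocator` carves the chunk, `LockedAllocator` adds a mutex, `RetryAllocator` is layered on only when `retry_time_ > 0`, and an `AlignedAllocator<64u>` caps the stack. A standalone sketch of that composition (stand-in types, not the Paddle classes):

```cpp
#include <memory>

struct Alloc {
  virtual ~Alloc() = default;
};
struct BestFit : Alloc {};
struct Locked : Alloc {
  explicit Locked(std::unique_ptr<Alloc>) {}
};
struct Retry : Alloc {
  Retry(std::unique_ptr<Alloc>, long /*retry_ms*/) {}
};
struct Aligned : Alloc {
  explicit Aligned(std::unique_ptr<Alloc>) {}
};

std::shared_ptr<Alloc> BuildChunkAllocator(long retry_time_ms) {
  // BestFit carves the chunk; Locked serializes access to it.
  std::unique_ptr<Alloc> a(new Locked(std::unique_ptr<Alloc>(new BestFit)));
  if (retry_time_ms > 0) {
    // Same move-then-reset dance as the diff: release ownership into the
    // wrapper, then let the unique_ptr own the wrapper.
    auto* retry = new Retry(std::move(a), retry_time_ms);
    a.reset(retry);
  }
  // Aligned caps the stack, rounding returned addresses up.
  return std::make_shared<Aligned>(std::move(a));
}

int main() { auto chunk_allocator = BuildChunkAllocator(500); }
```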
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index d87dd9a4b6df288065389a335a9ddb4047dd096a..e3d6c2f511ef083ef9ecc1fe8df96051b2b85cc2 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const {
   }
   return num;
 }
-void BestFitAllocator::FreeImpl(Allocation* allocation) {
+void BestFitAllocator::Free(Allocation* allocation) {
   auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
   PADDLE_ENFORCE_NOT_NULL(bf_allocation,
                           "The input allocation is not BestFitAllocation.");
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h
index c137438c0c35a575d366a1dfdf950262f711defa..4f10f2b53e8543d4197097f1cae8de765bceeb0f 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
@@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator {
   void InsertFreeNode(const ListIt& it);
 
  protected:
-  void FreeImpl(Allocation* allocation) override;
+  void Free(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc
index e04c0aa34b1cd6200806cc2a012161e3478eca0b..fc75abc9dfee6c9df5bc87faa493002cc1fe6298 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator.cc
@@ -22,11 +22,11 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-BufferedAllocator::BufferedAllocator(std::shared_ptr<Allocator> allocator)
+BufferedAllocator::BufferedAllocator(std::unique_ptr<Allocator> &&allocator)
     : underlying_allocator_(std::move(allocator)) {
   PADDLE_ENFORCE_NOT_NULL(
       underlying_allocator_,
-      "Underlying allocator of BufferedAllocator must not be null");
+      "Underlying allocator of BufferedAllocator must be unmanaged");
   if (underlying_allocator_->IsAllocThreadSafe()) {
     mtx_.reset(new std::mutex());
   }
@@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) {
   while (!allocations_.empty()) {  // free the largest
     auto it = --allocations_.end();
     cur += it->second->size();
-    underlying_allocator_->Free(it->second.release());
+    delete it->second.release();
     allocations_.erase(it);
     if (cur >= size) return;
   }
 }
 
-bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; }
-
-void BufferedAllocator::FreeImpl(Allocation *allocation) {
+bool BufferedAllocator::IsAllocThreadSafe() const {
+  return this->underlying_allocator_->IsAllocThreadSafe();
+}
+void BufferedAllocator::Free(Allocation *allocation) {
   platform::LockGuardPtr<std::mutex> guard(mtx_);
   allocations_.emplace(allocation->size(), AllocationPtr(allocation));
 }
-
 Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   {
     platform::LockGuardPtr<std::mutex> guard(mtx_);
@@ -61,15 +61,17 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
     if (it != allocations_.end() && it->first < size * 2) {
       AllocationPtr result(std::move(it->second));
       allocations_.erase(it);
-      return result.release();
+      return new AllocationWithUnderlying(std::move(result));
     }
   }
 
   try {
-    return underlying_allocator_->Allocate(size, attr).release();
+    return new AllocationWithUnderlying(
+        underlying_allocator_->Allocate(size, attr));
   } catch (BadAlloc &) {
     FreeCache(size);
-    return underlying_allocator_->Allocate(size, attr).release();
+    return new AllocationWithUnderlying(
+        underlying_allocator_->Allocate(size, attr));
   }
 }
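`BufferedAllocator::AllocateImpl` keeps its cache policy across the revert: take the smallest cached block that fits, but only if it is less than twice the request, and flush the cache largest-first when the underlying allocator throws `BadAlloc`. A standalone sketch of just that policy:

```cpp
#include <cstddef>
#include <iostream>
#include <map>

std::multimap<size_t, int /*block id*/> cache;

bool TakeFromCache(size_t size) {
  auto it = cache.lower_bound(size);  // smallest cached block >= size
  if (it != cache.end() && it->first < size * 2) {
    std::cout << "reuse block " << it->second << " of " << it->first << "\n";
    cache.erase(it);
    return true;
  }
  return false;  // miss: too small, or would waste more than half
}

void FreeCache(size_t bytes_needed) {
  size_t cur = 0;
  while (!cache.empty() && cur < bytes_needed) {
    auto it = --cache.end();  // evict the largest block first
    cur += it->first;
    cache.erase(it);
  }
}

int main() {
  cache.insert({256, 1});
  cache.insert({4096, 2});
  TakeFromCache(200);   // hit: 256 < 2 * 200
  TakeFromCache(1024);  // miss: 4096 >= 2 * 1024, too wasteful
  FreeCache(1 << 12);   // evicts the 4096-byte block
}
```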
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h
index c728395705842d29a7b2a8441a7048a7e4bf5e6b..d44a3f85beba712b1e735ba14008689bce7d0d64 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/buffered_allocator.h
@@ -31,7 +31,7 @@ namespace allocation {
 // underlying_allocator_
 class BufferedAllocator : public Allocator {
  public:
-  explicit BufferedAllocator(std::shared_ptr<Allocator> allocator);
+  explicit BufferedAllocator(std::unique_ptr<Allocator> &&allocator);
 
   ~BufferedAllocator();
 
@@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator {
   void FreeCache(size_t size);
 
  protected:
-  void FreeImpl(Allocation *allocation) override;
+  void Free(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::shared_ptr<Allocator> underlying_allocator_;
+  std::unique_ptr<Allocator> underlying_allocator_;
   std::multimap<size_t, AllocationPtr> allocations_;
   std::unique_ptr<std::mutex> mtx_;
 };
diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
index 854a117b0e7532962d5e0c95fd947527ac3b307a..c8bd5292ca0f6c3e7ebdc7f5908523b0b7c8ba3a 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/memory/allocation/buffered_allocator.h"
 #include <gtest/gtest.h>
+#include <memory>
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -65,7 +66,7 @@ class StubAllocator : public Allocator {
   size_t GetFreeCount() const { return destruct_count_; }
 
  protected:
-  void FreeImpl(Allocation *allocation) override {
+  void Free(Allocation *allocation) override {
     auto *alloc = dynamic_cast<StubAllocation *>(allocation);
     PADDLE_ENFORCE_NOT_NULL(alloc);
     if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr());
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc
index 90c49c87a677aa38bce35774b3a7bb698e6f43e7..cc81a6f7b8b1950b07b6fb1571b53d9b5ddb1b9f 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.cc
+++ b/paddle/fluid/memory/allocation/cpu_allocator.cc
@@ -20,27 +20,25 @@
 namespace paddle {
 namespace memory {
 namespace allocation {
 
+CPUAllocation::CPUAllocation(void *ptr, size_t size)
+    : Allocation(ptr, size, platform::CPUPlace()) {}
+
 bool CPUAllocator::IsAllocThreadSafe() const { return true; }
 
-void CPUAllocator::FreeImpl(Allocation *allocation) {
-  void *p = allocation->ptr();
-#ifdef _WIN32
-  _aligned_free(p);
-#else
-  free(p);
-#endif
+void CPUAllocator::Free(Allocation *allocation) {
+  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation *>(allocation));
+  free(allocation->ptr());
   delete allocation;
 }
 
 Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
-  void *p;
-#ifdef _WIN32
-  p = _aligned_malloc(size, kAlignment);
-#else
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!",
-                    size);
-#endif
-  return new Allocation(p, size, platform::CPUPlace());
+  void *ptr;
+  auto status = posix_memalign(&ptr, kAlignment, size);
+  if (UNLIKELY(status) != 0) {
+    throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d",
+                                   size, status));
+  }
+  return new CPUAllocation(ptr, size);
 }
 }  // namespace allocation
 }  // namespace memory
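The restored CPU path relies on `posix_memalign`, which reports failure through its return value rather than `errno`; the revert also drops the `_WIN32` branch and, in the header below, shrinks `kAlignment` from 4096 to 64 bytes. A standalone sketch of the call and its error handling:

```cpp
#include <cstdio>
#include <cstdlib>

int main() {
  // Must be a power of two and a multiple of sizeof(void*); 64 matches a
  // typical cache line, the value this revert restores.
  constexpr size_t kAlignment = 64;
  void* ptr = nullptr;
  int status = posix_memalign(&ptr, kAlignment, 1024);
  if (status != 0) {
    // posix_memalign returns the error code directly; errno is untouched.
    std::fprintf(stderr, "Cannot allocate cpu memory. Errno is %d\n", status);
    return 1;
  }
  std::printf("aligned block at %p\n", ptr);
  free(ptr);  // memory from posix_memalign is released with free()
}
```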
Errno is %d", + size, status)); + } + return new CPUAllocation(ptr, size); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 3eb1416b0efa9327f2052e1f128359bc93f94986..26d3643f4edff1f2d71b1c761e915a6dacb485ad 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -31,13 +31,19 @@ namespace allocation { // // NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import // an open-sourced allocator into Paddle. +class CPUAllocator; +class CPUAllocation : public Allocation { + public: + CPUAllocation(void* ptr, size_t size); +}; + class CPUAllocator : public Allocator { public: - constexpr static size_t kAlignment = 4096UL; + constexpr static size_t kAlignment = 64u; bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 895a24a6a2a6b8e399ec2ace48136d1ef16c62f6..430bf0be98e08787ac4412a8b6e0fcc310ffe2b4 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -23,14 +23,15 @@ namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::FreeImpl(Allocation* allocation) { +void CUDAAllocator::Free(Allocation* allocation) { platform::CUDADeviceGuard guard(place_.device); - PADDLE_ENFORCE_EQ(boost::get(allocation->place()), + auto* cuda_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(cuda_allocation); + PADDLE_ENFORCE_EQ(boost::get(cuda_allocation->place()), place_); PADDLE_ENFORCE(cudaFree(allocation->ptr())); delete allocation; } - Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::CUDADeviceGuard guard(place_.device); void* ptr; @@ -40,9 +41,8 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device, status, cudaGetErrorString(status))); } - return new Allocation(ptr, size, platform::Place(place_)); + return new CUDAAllocation(ptr, size, platform::Place(place_)); } - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 580a2d1df1d5997a27180740393741ec8973bf18..63726f5820b1c81565117c7a9bf798c17c9681f6 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -20,6 +20,13 @@ namespace paddle { namespace memory { namespace allocation { +// CUDA System allocator and allocation. +// Just a flag type. 
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 0dc2de37467b7e7d23c88b4a255c14795db4c275..514ac7883ad2effdf3518be8afe3f448a5ac10b2 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -134,22 +134,26 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
 }
 
 #ifdef PADDLE_WITH_CUDA
-class GPUBuddyAllocatorList {
- public:
-  GPUBuddyAllocatorList()
-      : allocators_(platform::GetCUDADeviceCount()),
-        flags_(platform::GetCUDADeviceCount()) {
-    allocation::GPUMemMonitor.Initialize(allocators_.size());
-  }
+BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
+  static std::once_flag init_flag;
+  static detail::BuddyAllocator **a_arr = nullptr;
+  static std::vector<int> devices;
+
+  std::call_once(init_flag, [gpu_id]() {
+    devices = platform::GetSelectedDevices();
+    int gpu_num = devices.size();
 
-  BuddyAllocator *Get(size_t dev_id) {
-    PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
-    std::call_once(flags_[dev_id], [this, dev_id] {
+    allocation::GPUMemMonitor.Initialize(devices.size());
+
+    a_arr = new BuddyAllocator *[gpu_num];
+    for (size_t i = 0; i < devices.size(); ++i) {
+      int dev_id = devices[i];
+      a_arr[i] = nullptr;
       platform::SetDeviceId(dev_id);
-      allocators_[dev_id] = new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(
-              new detail::GPUAllocator(dev_id)),
-          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+      a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                        new detail::GPUAllocator(dev_id)),
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
 
       VLOG(10) << "\n\nNOTE:\n"
                << "You can set GFlags environment variable "
@@ -163,19 +167,13 @@ class GPUBuddyAllocatorList {
                << FLAGS_initial_gpu_memory_in_mb
                << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
                << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
-    });
-    return allocators_[dev_id];
-  }
-
- private:
-  std::vector<BuddyAllocator *> allocators_;
-  std::vector<std::once_flag> flags_;
-};
+    }
+  });
 
-BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
-  static GPUBuddyAllocatorList allocators;
   platform::SetDeviceId(gpu_id);
-  return allocators.Get(gpu_id);
+  auto pos = std::distance(devices.begin(),
+                           std::find(devices.begin(), devices.end(), gpu_id));
+  return a_arr[pos];
 }
 #endif
@@ -194,7 +192,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 #ifdef PADDLE_WITH_CUDA
   auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto *ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr && size > 0) {
+  if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();
     platform::SetDeviceId(place.device);
     size_t avail, total;
@@ -349,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   return tmp_alloc;
 }
 
-void LegacyAllocator::FreeImpl(Allocation *allocation) {
+void LegacyAllocator::Free(Allocation *allocation) {
   boost::apply_visitor(
       legacy::FreeVisitor(allocation->ptr(), allocation->size()),
       allocation->place());
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h
index 27cd42ea35012f07ae7db79c46d767138ddaafff..d9bdae153da6439598f76f5cac226897e6e0c596 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
@@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator {
 
  protected:
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
-  void FreeImpl(Allocation *allocation) override;
+  void Free(Allocation *allocation) override;
 
  private:
   platform::Place place_;
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
index c43099cc88f839ad92d36774d49aafd7192f916f..62d768c580607f32db8c49eb3d62f0f32c9dbeeb 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
@@ -17,7 +17,6 @@
 #include <mutex>  // NOLINT
 #include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
-
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -25,24 +24,26 @@ namespace allocation {
 bool LockedAllocator::IsAllocThreadSafe() const { return true; }
 
 LockedAllocator::LockedAllocator(
-    std::shared_ptr<Allocator> underlying_allocator)
+    std::unique_ptr<Allocator> &&underlying_allocator)
     : underlying_allocator_(std::move(underlying_allocator)) {
   PADDLE_ENFORCE_NOT_NULL(underlying_allocator_);
   if (!underlying_allocator_->IsAllocThreadSafe()) {
     mtx_.reset(new std::mutex());
   }
 }
-
-void LockedAllocator::FreeImpl(Allocation *allocation) {
-  platform::LockGuardPtr<std::mutex> guard(mtx_);
-  underlying_allocator_->Free(allocation);
+void LockedAllocator::Free(Allocation *allocation) {
+  {
+    platform::LockGuardPtr<std::mutex> guard(mtx_);
+    reinterpret_cast<AllocationWithUnderlying *>(allocation)
+        ->allocation_.reset();  // Destroy inner allocation
+  }
+  delete allocation;
 }
-
 Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   platform::LockGuardPtr<std::mutex> guard(mtx_);
-  return underlying_allocator_->Allocate(size, attr).release();
+  return new AllocationWithUnderlying(
+      underlying_allocator_->Allocate(size, attr));
 }
-
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
index b735ccef101417b3f880eb6dcdd9964cffbe875c..4967b9bb8d3ad101cff4657b0a45b49b76e2deb2 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -24,15 +24,15 @@ namespace allocation {
 // An allocator to make an underlying allocator thread safe.
 class LockedAllocator : public Allocator {
  public:
-  explicit LockedAllocator(std::shared_ptr<Allocator> underlying_allocator);
+  explicit LockedAllocator(std::unique_ptr<Allocator> &&underlying_allocator);
 
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void FreeImpl(Allocation *allocation) override;
+  void Free(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::shared_ptr<Allocator> underlying_allocator_;
+  std::unique_ptr<Allocator> underlying_allocator_;
   std::unique_ptr<std::mutex> mtx_;
 };
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc
deleted file mode 100644
index 3334589a4beb407447cf89c173f6128654bb245a..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-
-#ifdef PADDLE_WITH_CUDA
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
-DECLARE_int64(gpu_allocator_retry_time);
-#endif
-
-DECLARE_string(allocator_strategy);
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-TEST(allocator, allocator) {
-#ifdef PADDLE_WITH_CUDA
-  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
-  FLAGS_gpu_allocator_retry_time = 500;
-  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
-#endif
-
-  FLAGS_allocator_strategy = "naive_best_fit";
-
-  auto &instance = AllocatorFacade::Instance();
-  platform::Place place;
-  size_t size = 1024;
-
-  {
-    place = platform::CPUPlace();
-    size = 1024;
-    auto cpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(cpu_allocation, nullptr);
-    ASSERT_NE(cpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(cpu_allocation->place(), place);
-    ASSERT_EQ(cpu_allocation->size(), size);
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    place = platform::CUDAPlace(0);
-    size = 1024;
-    auto gpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(gpu_allocation, nullptr);
-    ASSERT_NE(gpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(gpu_allocation->place(), place);
-    ASSERT_GE(gpu_allocation->size(), size);
-  }
-
-  {
-    // Allocate 2GB gpu memory
-    place = platform::CUDAPlace(0);
-    size = 2 * static_cast<size_t>(1 << 30);
-    auto gpu_allocation = instance.Alloc(place, size);
-    ASSERT_NE(gpu_allocation, nullptr);
-    ASSERT_NE(gpu_allocation->ptr(), nullptr);
-    ASSERT_EQ(gpu_allocation->place(), place);
-    ASSERT_GE(gpu_allocation->size(), size);
-  }
-
-  {
-    place = platform::CUDAPinnedPlace();
-    size = (1 << 20);
-    auto cuda_pinned_allocation =
-        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
-    ASSERT_NE(cuda_pinned_allocation, nullptr);
-    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
-    ASSERT_EQ(cuda_pinned_allocation->place(), place);
-    ASSERT_GE(cuda_pinned_allocation->size(), size);
-  }
-#endif
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index 5a3d817211750d3e19e65344d1eab5a96800c674..de81d12cca6ca280289371abdec225c9e2b8f4d0 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -20,15 +20,20 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
-void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
+void CPUPinnedAllocator::Free(Allocation *allocation) {
+  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation *>(allocation));
   PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
   delete allocation;
 }
 
 Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
                                              Allocator::Attr attr) {
+  // PADDLE_ENFORCE_EQ(
+  //     attr, kCrossDevice,
+  //     "CPUPinnedAllocator should be used for Cross-Device Communication");
+
   void *ptr;
   PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
-  return new Allocation(ptr, size, platform::CUDAPinnedPlace());
+  return new CPUPinnedAllocation(ptr, size);
 }
 }  // namespace allocation
 }  // namespace memory
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h
index deeb55a8fb0396a312286f5c2692114e9e4afc8d..42d0938f2afbb1efca8bfdd7035bc0eada30f06b 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
@@ -20,12 +20,18 @@ namespace memory {
 namespace allocation {
 
 // Allocator uses `cudaHostAlloc`
+class CPUPinnedAllocation : public Allocation {
+ public:
+  CPUPinnedAllocation(void *ptr, size_t size)
+      : Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
+};
+
 class CPUPinnedAllocator : public Allocator {
  public:
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void FreeImpl(Allocation *allocation) override;
+  void Free(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 };
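The pinned path still goes through `cudaHostAlloc` with `cudaHostAllocPortable`, which pins host pages so they stay resident for asynchronous DMA and makes the pinning visible to all CUDA contexts; such memory must be released with `cudaFreeHost`, as `Free()` above does. A minimal standalone example (requires the CUDA toolkit to build):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  void* ptr = nullptr;
  cudaError_t status = cudaHostAlloc(&ptr, 1 << 20, cudaHostAllocPortable);
  if (status != cudaSuccess) {
    std::fprintf(stderr, "cudaHostAlloc failed: %s\n",
                 cudaGetErrorString(status));
    return 1;
  }
  std::printf("pinned 1 MiB at %p\n", ptr);
  cudaFreeHost(ptr);  // pinned memory must go back through cudaFreeHost
  return 0;
}
```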
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index 7e888988f9602e362d73f64c1b45552e84e3349c..981705051b449e6a35c2dcce9138dc2efae52920 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -18,15 +18,25 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-void RetryAllocator::FreeImpl(Allocation* allocation) {
+bool RetryAllocator::IsAllocThreadSafe() const {
+  return underlying_allocator_->IsAllocThreadSafe();
+}
+
+void RetryAllocator::Free(Allocation* allocation) {
   // Delete underlying allocation first.
-  underlying_allocator_->Free(allocation);
-  cv_.notify_all();
+  reinterpret_cast<AllocationWithUnderlying*>(allocation)->allocation_.reset();
+  {
+    // notify all waiting allocators; they can retry after this free.
+    std::lock_guard<std::mutex> lock(mutex_);
+    cv_.notify_all();
+  }
+  delete allocation;
 }
 
 Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   auto alloc_func = [&, this]() {
-    return underlying_allocator_->Allocate(size, attr).release();
+    return new AllocationWithUnderlying(
+        underlying_allocator_->Allocate(size, attr));
   };
   // In fact, we can unify the code of allocation success and failure
   // But it would add a lock even when allocation succeeds at the first try
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 379f576d6e1ed8f256a0233b203423a487ee73e4..6ab8ca8fbec0077b2c95cf727731ca0095716197 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -25,25 +25,32 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+class RetryAllocator;
+
 class RetryAllocator : public Allocator {
  public:
-  RetryAllocator(std::shared_ptr<Allocator> allocator, size_t retry_ms)
+  RetryAllocator(std::unique_ptr<Allocator>&& allocator, size_t retry_ms)
       : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) {
+    EnforceCheck();
+  }
+
+  bool IsAllocThreadSafe() const override;
+
+ private:
+  void EnforceCheck() {
     PADDLE_ENFORCE_NOT_NULL(
-        underlying_allocator_,
-        "UnderlyingAllocator of RetryAllocator must not be null");
+        underlying_allocator_.get(),
+        "UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator");
     PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(),
                    "UnderlyingAllocator of RetryAllocator must be thread-safe");
   }
 
-  bool IsAllocThreadSafe() const override { return true; }
-
  protected:
-  void FreeImpl(Allocation* allocation) override;
+  void Free(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::shared_ptr<Allocator> underlying_allocator_;
+  std::unique_ptr<Allocator> underlying_allocator_;
   std::chrono::milliseconds retry_time_;
   std::mutex mutex_;
   std::condition_variable cv_;
@@ -51,6 +58,8 @@ class RetryAllocator : public Allocator {
   // For debug, we can add an atomic integer to record how many memory sizes
   // are waiting to be allocated
   // std::atomic<size_t> waited_allocate_size_{0};
+
+  friend class RetryAllocation;
 };
 
 }  // namespace allocation
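`RetryAllocator`'s protocol is unchanged in spirit: a failed allocation blocks on a condition variable until some `Free()` signals, then retries within a bounded window. A standalone sketch of that wait/notify handshake (a boolean stands in for actual memory pressure):

```cpp
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex mtx;
std::condition_variable cv;
bool memory_available = false;

bool AllocateWithRetry(std::chrono::milliseconds retry_time) {
  std::unique_lock<std::mutex> lock(mtx);
  if (memory_available) return true;  // first attempt succeeded
  // Wait until a Free() notifies us, or the retry window expires;
  // the real allocator would rethrow BadAlloc on timeout.
  return cv.wait_for(lock, retry_time, [] { return memory_available; });
}

void Free() {
  {
    std::lock_guard<std::mutex> lock(mtx);
    memory_available = true;  // release resources before notifying
  }
  cv.notify_all();
}

int main() {
  std::thread t([] {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    Free();
  });
  std::cout << AllocateWithRetry(std::chrono::milliseconds(500)) << "\n";  // 1
  t.join();
}
```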
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc
index 39743bcb10c700c9a8446b9040c8a8707d57ec7d..cb2df1a029815478bbc9d3b09425f3ef145c5fb3 100644
--- a/paddle/fluid/memory/allocation/zero_size_allocator.cc
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.cc
@@ -24,20 +24,11 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const {
 
 Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   if (size == 0) {
-    return new Allocation(nullptr, 0, place_);
+    return new ZeroSizeAllocation(place_);
   } else {
     return underlying_allocator_->Allocate(size, attr).release();
   }
 }
-
-void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
-  if (allocation->size() == 0) {
-    delete allocation;
-  } else {
-    underlying_allocator_->Free(allocation);
-  }
-}
-
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h
index 08a7a06dbf290b55994a407fe478f792b0c0964a..0f01dfcdf5b1179c52d8c0204b655cab10770d95 100644
--- a/paddle/fluid/memory/allocation/zero_size_allocator.h
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.h
@@ -24,6 +24,12 @@ namespace allocation {
 // The allocator handles requests whose size is zero. It will always
 // return an allocation even when the requested size is zero. However, the
 // allocation.ptr() is nullptr
+class ZeroSizeAllocation : public Allocation {
+ public:
+  explicit ZeroSizeAllocation(const platform::Place& p)
+      : Allocation(nullptr, 0, p) {}
+};
+
 class ZeroSizeAllocator : public Allocator {
  public:
   ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
@@ -34,7 +40,6 @@ class ZeroSizeAllocator : public Allocator {
 
  protected:
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
-  void FreeImpl(Allocation* allocation) override;
 
  private:
   std::shared_ptr<Allocator> underlying_allocator_;
diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc
index ddde7baf4cf3b44ac5d8a22fcc98acef50294575..250efe70fd631fc58be7b7c81c75ea53bd55b5cd 100644
--- a/paddle/fluid/platform/temporary_allocator.cc
+++ b/paddle/fluid/platform/temporary_allocator.cc
@@ -30,31 +30,38 @@ namespace paddle {
 namespace platform {
 namespace alloc = memory::allocation;
 
+TemporaryAllocation::TemporaryAllocation(
+    alloc::AllocationPtr &&underlying_allocation)
+    : Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
+                 underlying_allocation->place()),
+      underlying_allocation_(std::move(underlying_allocation)) {}
+
 TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) {
-  temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>());
+  temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
 }
 
 bool TemporaryAllocator::IsAllocThreadSafe() const { return true; }
 
 void TemporaryAllocator::Release(const std::function<void()> &callback) {
-  std::unique_ptr<std::multimap<size_t, alloc::Allocation *>> t_allocations;
+  std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> t_allocations;
   {
     std::unique_lock<std::mutex> lock(mtx_);
     callback();
     t_allocations.swap(temp_mem_map_);
-    temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>());
+    temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
     wait_delete_mem_ = 0;
   }
 
-  alloc::AllocationDeleter deleter;
   for (auto tmp : *t_allocations) {
     VLOG(10) << "Delete temporary allocation " << tmp.second->ptr()
              << " size: " << tmp.second->size();
-    deleter(tmp.second);
+    delete tmp.second;
   }
 }
 
-void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
+void TemporaryAllocator::Free(alloc::Allocation *allocation) {
+  auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation);
+  PADDLE_ENFORCE_NOT_NULL(temp_allocation);
   if (platform::is_gpu_place(temp_allocation->place())) {
     PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_),
                    "The place should be the same.");
@@ -78,7 +85,7 @@
   }
   VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
            << " size: " << temp_allocation->size();
-  alloc::AllocationDeleter()(temp_allocation);
+  delete temp_allocation;
 }
 
 size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
@@ -113,9 +120,11 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl(
   }
   // If no available allocation is found, get an allocation from
   // AllocatorFacadeInstance.
-  auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
+  auto raw_allocation =
+      alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
+  auto temp_mem = new TemporaryAllocation(std::move(raw_allocation));
   VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size;
-  return temp_mem.release();
+  return temp_mem;
 }
 
 }  // namespace platform
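`TemporaryAllocation` is reintroduced as an owning wrapper: it mirrors the underlying allocation's pointer, size, and place while holding the `AllocationPtr` alive, so destroying the wrapper releases the real memory. A standalone sketch of the idiom:

```cpp
#include <cstdio>
#include <cstdlib>
#include <memory>

struct Allocation {
  Allocation(void* p, size_t n) : ptr(p), size(n) {}
  virtual ~Allocation() = default;
  void* ptr;
  size_t size;
};

struct MallocAllocation : Allocation {
  explicit MallocAllocation(size_t n) : Allocation(std::malloc(n), n) {}
  ~MallocAllocation() override { std::free(ptr); }
};

// Mirrors ptr/size of the wrapped allocation and keeps it alive by owning it.
struct OwningWrapper : Allocation {
  explicit OwningWrapper(std::unique_ptr<Allocation>&& underlying)
      : Allocation(underlying->ptr, underlying->size),
        underlying_(std::move(underlying)) {}
  std::unique_ptr<Allocation> underlying_;
};

int main() {
  std::unique_ptr<Allocation> raw(new MallocAllocation(128));
  void* mem = raw->ptr;
  auto* tmp = new OwningWrapper(std::move(raw));
  std::printf("same memory: %d\n", tmp->ptr == mem);  // 1
  delete tmp;  // destroys underlying_, which frees the buffer
}
```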
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h
index 912d45eaf17fe8c05840995275dd3e2e688b38ef..f8a43b889d58d5e027aac8e08324cf51b7d82913 100644
--- a/paddle/fluid/platform/temporary_allocator.h
+++ b/paddle/fluid/platform/temporary_allocator.h
@@ -23,6 +23,14 @@
 namespace paddle {
 namespace platform {
 
+class TemporaryAllocation : public memory::allocation::Allocation {
+ public:
+  explicit TemporaryAllocation(
+      memory::allocation::AllocationPtr &&underlying_allocation);
+
+  memory::allocation::AllocationPtr underlying_allocation_;
+};
+
 /*! \brief the TemporaryAllocator is used to alloc the temporary allocation
  * which is used by CUDA's async operation.
  *
@@ -49,7 +57,7 @@ class TemporaryAllocator : public memory::allocation::Allocator {
   void SetCallback(const std::function<void()> &callback);
 
  protected:
-  void FreeImpl(memory::allocation::Allocation *allocation) override;
+  void Free(memory::allocation::Allocation *allocation) override;
   memory::allocation::Allocation *AllocateImpl(
       size_t size, memory::allocation::Allocator::Attr attr) override;
 
@@ -58,8 +66,8 @@ class TemporaryAllocator : public memory::allocation::Allocator {
   platform::Place place_;
   // When the allocation is not held by any variable, it should be placed
   // to temp_mem_map immediately.
-  std::unique_ptr<std::multimap<size_t, memory::allocation::Allocation *>>
-      temp_mem_map_{nullptr};
+  std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{
+      nullptr};
   std::mutex mtx_;
   size_t wait_delete_mem_{0};
   std::function<void()> callback_;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index dca40edf0bbe334d7a9306b9b6b259b0bc9a6f5d..7bf089637862c969d27f957a630469d1644222bf 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -324,7 +324,6 @@ PYBIND11_MODULE(core, m) {
            [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
              self.mutable_data<float>(place);
            })
-      .def("_clear", &Tensor::clear)
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h
index 66b768665b6d0b97b4ca1470020132bfc9576bbb..16bb3771f2e9bcc07028ef2039fed8691f9aab97 100644
--- a/paddle/fluid/string/printf.h
+++ b/paddle/fluid/string/printf.h
@@ -105,12 +105,14 @@ void Printf(const char* fmt, const Args&... args) {
   Fprintf(std::cout, fmt, args...);
 }
 
-inline std::string HumanReadableSize(double f_size) {
+template <typename T>
+std::string HumanReadableSize(T size) {
   size_t i = 0;
+  double f_size = static_cast<double>(size);
   double orig = f_size;
   const std::vector<std::string> units(
       {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
-  while (f_size >= 1024) {
+  while (f_size > 1024) {
     f_size /= 1024;
     i++;
   }
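The templated `HumanReadableSize` widens any numeric input to `double` before scaling, and the loop boundary changes from `>=` to `>`, so an input of exactly 1024 now stays in the smaller unit ("1024B" instead of "1kB"). A standalone sketch of the behavior; the `%f%s` formatting tail is an assumption, since the function's printing code falls outside this hunk:

```cpp
#include <cstdio>
#include <string>
#include <vector>

template <typename T>
std::string HumanReadableSize(T size) {
  size_t i = 0;
  double f_size = static_cast<double>(size);  // works for any numeric T
  const std::vector<std::string> units(
      {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
  // Assumes size < 1024^9 so the unit index stays in range.
  while (f_size > 1024) {
    f_size /= 1024;
    i++;
  }
  char buf[64];
  std::snprintf(buf, sizeof(buf), "%f%s", f_size, units[i].c_str());
  return std::string(buf);
}

int main() {
  std::printf("%s\n", HumanReadableSize(1536).c_str());   // ~1.5kB
  std::printf("%s\n", HumanReadableSize(1024).c_str());   // 1024B with `>`
  std::printf("%s\n", HumanReadableSize(3.5e9).c_str());  // ~3.26GB
}
```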