diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 8a86813e9362d7b82c2023428a35a1982adb0508..24ab33a1442e261bf98de13c2dd4b1f9af630ec8 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -404,9 +404,6 @@ class ExecutionContext {
     auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
         allocation_ptr, deleter);
 
-    PADDLE_ENFORCE(
-        dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
-        "The AllocationPtr must be TemporaryAllocation.");
     PADDLE_ENFORCE_GE(allocation_ptr->size(),
                       framework::product(dim) * sizeof(T));
 
diff --git a/paddle/fluid/framework/small_stack.h b/paddle/fluid/framework/small_stack.h
new file mode 100644
index 0000000000000000000000000000000000000000..6919ff7a28aac909511d0a8fd983df3eeaf3ca13
--- /dev/null
+++ b/paddle/fluid/framework/small_stack.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <deque>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T, size_t N>
+class SmallStack {
+  static_assert(N > 0, "N must be larger than 0");
+
+ public:
+  inline void push(const T& item) {
+    if (size_ < N) {
+      head_[size_] = item;
+    } else {
+      tail_.emplace_back(item);
+    }
+    ++size_;
+  }
+
+  inline void pop() {
+    PADDLE_ENFORCE(!empty(), "Try to pop element from empty stack.");
+    if (size_ > N) {
+      tail_.pop_back();
+    }
+    --size_;
+  }
+
+  inline const T& top() const {
+    PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
+    return size_ <= N ? head_[size_ - 1] : tail_.back();
+  }
+
+  inline T& top() {
+    PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
+    return size_ <= N ? head_[size_ - 1] : tail_.back();
+  }
+
+  inline bool empty() const { return size_ == 0; }
+
+  inline size_t size() const { return size_; }
+
+  // This API can only be used in unit tests
+  T& operator[](size_t i) { return i < N ? head_[i] : tail_[i - N]; }
+
+  const T& operator[](size_t i) const {
+    return i < N ? head_[i] : tail_[i - N];
+  }
+
+ private:
+  T head_[N];
+  std::deque<T> tail_;
+  size_t size_{0};
+};
+
+}  // namespace framework
+}  // namespace paddle
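A minimal usage sketch of SmallStack (editorial illustration assuming only the header above; not part of the patch). The first N pushes live in the inline head_ array; later pushes spill into the deque tail_:

#include "paddle/fluid/framework/small_stack.h"

#include <cassert>

int main() {
  paddle::framework::SmallStack<int, 2> stack;
  stack.push(1);             // stored inline in head_[0]
  stack.push(2);             // stored inline in head_[1]
  stack.push(3);             // inline capacity exceeded, spills into tail_
  assert(stack.size() == 3);
  assert(stack.top() == 3);  // top comes from tail_
  stack.pop();               // pops from tail_
  assert(stack.top() == 2);  // back to the inline head_ array
  return 0;
}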
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 4b7b9064dcde9b5209264257d51bbd976ba8eb85..5ff91dfbc28be8de6bc4289544ebfad1269857e6 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -3,8 +3,11 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
+cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator)
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator)
+
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
+cc_test(multi_bin_buffered_allocator_test SRCS multi_bin_buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator multi_bin_buffered_allocator cpu_allocator)
 
 if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
@@ -53,6 +56,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
     conditional_allocator
     retry_allocator
     buffered_allocator
+    multi_bin_buffered_allocator
     allocator_strategy
     legacy_allocator
     )
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index fc1a8e9247b16374037bfde44449fd552b44c6b4..602d85bf9e8811bb7b1effc2292a84be1301b14c 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -93,6 +93,8 @@ class AlignedAllocator : public ThinAlignedAllocator {
         underlying_allocator_->Allocate(size + kAlignment, attr);
     return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size);
   }
+
+  void FreeImpl(Allocation* allocation) override { delete allocation; }
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc
index 8fb8a5fb897a736d7515951ba08c633da9a7706c..664b3b8420f7590dc605cb8c67059f334fc76139 100644
--- a/paddle/fluid/memory/allocation/allocator.cc
+++ b/paddle/fluid/memory/allocation/allocator.cc
@@ -26,17 +26,28 @@ Allocator::~Allocator() {}
 bool Allocator::IsAllocThreadSafe() const { return false; }
 
 AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
+  VLOG(2) << "Alloc allocation on " << typeid(*this).name();
   auto ptr = AllocateImpl(size, attr);
-  ptr->set_allocator(this);
+  ptr->RegisterAllocatorChain(this);
+  VLOG(2) << "Alloc success";
   return AllocationPtr(ptr);
 }
 
-void Allocator::Free(Allocation* allocation) { delete allocation; }
+void Allocator::FreeImpl(Allocation* allocation) {
+  auto* allocator = allocation->TopAllocator();
+  allocator->Free(allocation);
+}
+
+void Allocator::Free(Allocation* allocation) {
+  VLOG(2) << "Free allocation on " << typeid(*this).name();
+  allocation->PopAllocator();
+  FreeImpl(allocation);
+}
 
 const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
 
 void AllocationDeleter::operator()(Allocation* allocation) const {
-  auto* allocator = allocation->allocator();
+  auto* allocator = allocation->TopAllocator();
   allocator->Free(allocation);
 }
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index f2b6f438c382275cab4ecf9aceea1c55e5885dee..f74fab3c7516db1ce9f49fad091125265d78e5f2 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -15,6 +15,8 @@
 #pragma once
 #include <memory>
 #include <string>
+#include <vector>
+#include "paddle/fluid/framework/small_stack.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -47,10 +49,12 @@ class Allocator;
 class Allocation {
  public:
   Allocation(void* ptr, size_t size, platform::Place place)
-      : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}
+      : ptr_(ptr), size_(size), place_(place) {}
 
   Allocation(const Allocation& o) = delete;
   Allocation& operator=(const Allocation& o) = delete;
+  Allocation(Allocation&& o) = delete;
+  Allocation& operator=(Allocation&& o) = delete;
 
   // Returns the holding pointer.
   // NOTE: For performance consideration, it is better not to make this method
@@ -72,17 +76,34 @@ class Allocation {
 
   const platform::Place& place() const { return place_; }
 
-  Allocator* allocator() { return allocator_; }
+  virtual ~Allocation();
 
-  void set_allocator(Allocator* allocator) { allocator_ = allocator; }
+  // This function should only be used in unit tests
+  std::vector<Allocator*> GetAllocatorChain() const {
+    std::vector<Allocator*> allocators;
+    allocators.reserve(allocator_chain_.size());
+    for (size_t i = 0; i < allocator_chain_.size(); ++i) {
+      allocators.push_back(allocator_chain_[i]);
+    }
+    return allocators;
+  }
 
-  virtual ~Allocation();
+ private:
+  inline void RegisterAllocatorChain(Allocator* allocator) {
+    allocator_chain_.push(allocator);
+  }
+
+  inline void PopAllocator() { allocator_chain_.pop(); }
+
+  inline Allocator* TopAllocator() { return allocator_chain_.top(); }
 
  private:
-  Allocator* allocator_;
   void* ptr_;
   size_t size_;
   platform::Place place_;
+  framework::SmallStack<Allocator*, 8> allocator_chain_;
+
+  friend class Allocator;
+  friend class AllocationDeleter;
 };
 
 using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
@@ -132,9 +153,12 @@ class Allocator {
   // True if the `Allocate` is thread safe.
   virtual bool IsAllocThreadSafe() const;
 
+  // This function should not be called outside the allocator framework
+  void Free(Allocation* allocation);
+
  protected:
-  virtual void Free(Allocation* allocation);
   virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;
+  virtual void FreeImpl(Allocation* allocation);
 
  private:
   friend class AllocationDeleter;
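To see why the allocator chain removes the need for AllocationWithUnderlying wrappers, here is a self-contained toy model of the Allocate/Free protocol above (simplified stand-in types, not the real Paddle classes): each Allocate pushes its allocator onto the allocation's chain, and each Free pops it and forwards to the next allocator down.

#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <stack>

struct Allocation;

struct Allocator {
  virtual ~Allocator() = default;
  Allocation* Allocate(std::size_t size);
  void Free(Allocation* a);

 protected:
  virtual Allocation* AllocateImpl(std::size_t size) = 0;
  virtual void FreeImpl(Allocation* a);
};

struct Allocation {
  void* ptr{nullptr};
  std::stack<Allocator*> chain;  // plays the role of SmallStack
};

Allocation* Allocator::Allocate(std::size_t size) {
  Allocation* a = AllocateImpl(size);
  a->chain.push(this);  // record this allocator on the way out
  return a;
}
void Allocator::Free(Allocation* a) {
  a->chain.pop();  // this allocator is done with the allocation
  FreeImpl(a);
}
void Allocator::FreeImpl(Allocation* a) {
  a->chain.top()->Free(a);  // default: hand back to the next allocator down
}

struct Base : Allocator {  // the allocator that really owns the memory
  Allocation* AllocateImpl(std::size_t size) override {
    auto* a = new Allocation;
    a->ptr = std::malloc(size);
    return a;
  }
  void FreeImpl(Allocation* a) override {
    std::free(a->ptr);
    delete a;
  }
};

struct PassThrough : Allocator {  // stands in for Locked/Buffered/etc.
  explicit PassThrough(Allocator* u) : underlying(u) {}
  Allocator* underlying;
  Allocation* AllocateImpl(std::size_t size) override {
    return underlying->Allocate(size);  // pushes `underlying` onto the chain
  }
  // Inherits the default FreeImpl, which unwinds to `underlying`.
};

int main() {
  Base base;
  PassThrough outer(&base);
  Allocation* a = outer.Allocate(64);  // chain is now [base, outer]
  assert(a->chain.size() == 2);
  outer.Free(a);  // unwinds outer -> base, which frees the memory
  return 0;
}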
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index ea0b729dc6f62f517877e060cb0ecbe5c1d22e61..1a9f5e8f7f018029128b7dc9ddd0cb07549cc0b8 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -27,6 +27,7 @@
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
+#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -43,6 +44,8 @@ DEFINE_int64(
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
 
+DEFINE_bool(enable_buffered_allocator, false, "Enable buffered_allocator");
+
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -110,8 +113,8 @@ class ChunkedAllocator : public Allocator {
   std::shared_ptr<Allocator> CreateAllocatorWithChunk() {
     chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
     auto* allocation = chunks_.back().get();
-    std::unique_ptr<Allocator> allocator(new LockedAllocator(
-        std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
+    std::shared_ptr<Allocator> allocator(new LockedAllocator(
+        std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));
 
     if (retry_time_ > 0) {
       auto* retry_allocator =
@@ -119,6 +122,10 @@ class ChunkedAllocator : public Allocator {
       allocator.reset(retry_allocator);
     }
 
+    if (FLAGS_enable_buffered_allocator) {
+      allocator.reset(new MultiBinBufferedAllocator(allocator));
+    }
+
     return std::make_shared<AlignedAllocator<64>>(std::move(allocator));
   }
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index e3d6c2f511ef083ef9ecc1fe8df96051b2b85cc2..d87dd9a4b6df288065389a335a9ddb4047dd096a 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const {
   }
   return num;
 }
-void BestFitAllocator::Free(Allocation* allocation) {
+void BestFitAllocator::FreeImpl(Allocation* allocation) {
   auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
   PADDLE_ENFORCE_NOT_NULL(bf_allocation,
                           "The input allocation is not BestFitAllocation.");
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h
index 4f10f2b53e8543d4197097f1cae8de765bceeb0f..c137438c0c35a575d366a1dfdf950262f711defa 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
@@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator {
   void InsertFreeNode(const ListIt& it);
 
  protected:
-  void Free(Allocation* allocation) override;
+  void FreeImpl(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc
index fc75abc9dfee6c9df5bc87faa493002cc1fe6298..e04c0aa34b1cd6200806cc2a012161e3478eca0b 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator.cc
@@ -22,11 +22,11 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-BufferedAllocator::BufferedAllocator(std::unique_ptr<Allocator> &&allocator)
+BufferedAllocator::BufferedAllocator(std::shared_ptr<Allocator> allocator)
     : underlying_allocator_(std::move(allocator)) {
   PADDLE_ENFORCE_NOT_NULL(
       underlying_allocator_,
-      "Underlying allocator of BufferedAllocator must be unmanaged");
+      "Underlying allocator of BufferedAllocator must not be null");
   if (underlying_allocator_->IsAllocThreadSafe()) {
     mtx_.reset(new std::mutex());
   }
@@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) {
   while (!allocations_.empty()) {  // free the largest
     auto it = --allocations_.end();
     cur += it->second->size();
-    delete it->second.release();
+    underlying_allocator_->Free(it->second.release());
    allocations_.erase(it);
     if (cur >= size) return;
   }
 }
 
-bool BufferedAllocator::IsAllocThreadSafe() const {
-  return this->underlying_allocator_->IsAllocThreadSafe();
-}
-void BufferedAllocator::Free(Allocation *allocation) {
+bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; }
+
+void BufferedAllocator::FreeImpl(Allocation *allocation) {
   platform::LockGuardPtr<std::mutex> guard(mtx_);
   allocations_.emplace(allocation->size(), AllocationPtr(allocation));
 }
+
 Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   {
     platform::LockGuardPtr<std::mutex> guard(mtx_);
@@ -61,17 +61,15 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
     if (it != allocations_.end() && it->first < size * 2) {
       AllocationPtr result(std::move(it->second));
       allocations_.erase(it);
-      return new AllocationWithUnderlying(std::move(result));
+      return result.release();
     }
   }
 
   try {
-    return new AllocationWithUnderlying(
-        underlying_allocator_->Allocate(size, attr));
+    return underlying_allocator_->Allocate(size, attr).release();
   } catch (BadAlloc &) {
     FreeCache(size);
-    return new AllocationWithUnderlying(
-        underlying_allocator_->Allocate(size, attr));
+    return underlying_allocator_->Allocate(size, attr).release();
   }
 }
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h
index d44a3f85beba712b1e735ba14008689bce7d0d64..c728395705842d29a7b2a8441a7048a7e4bf5e6b 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/buffered_allocator.h
@@ -31,7 +31,7 @@ namespace allocation {
 // underlying_allocator_
 class BufferedAllocator : public Allocator {
  public:
-  explicit BufferedAllocator(std::unique_ptr<Allocator> &&allocator);
+  explicit BufferedAllocator(std::shared_ptr<Allocator> allocator);
 
   ~BufferedAllocator();
 
@@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator {
   void FreeCache(size_t size);
 
  protected:
-  void Free(Allocation *allocation) override;
+  void FreeImpl(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::unique_ptr<Allocator> underlying_allocator_;
+  std::shared_ptr<Allocator> underlying_allocator_;
   std::multimap<size_t, AllocationPtr> allocations_;
   std::unique_ptr<std::mutex> mtx_;
 };
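The reuse rule in BufferedAllocator::AllocateImpl (`it->first < size * 2`) means a cached block serves a request when its size lies in [size, 2*size). A standalone illustration (editorial sketch, not patch code):

#include <cstdio>
#include <map>

int main() {
  // The cache maps block size -> cached allocation; here a single 1024-byte
  // block stands in for BufferedAllocator::allocations_.
  std::multimap<size_t, const char*> cache{{1024, "cached block"}};
  size_t request = 900;
  auto it = cache.lower_bound(request);  // smallest block >= 900 -> 1024
  if (it != cache.end() && it->first < request * 2) {
    // 1024 < 1800, so the cached block is reused instead of allocating anew.
    std::printf("reuse %zu-byte %s for a %zu-byte request\n", it->first,
                it->second, request);
  }
  return 0;
}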
diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
index 41ebb9dbeaf36eafe3dff4ae294b84427f660cbf..7b2138cf34ce1dc5358d4494a519b07b09608f6c 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
@@ -64,7 +64,7 @@ class StubAllocator : public Allocator {
   size_t GetFreeCount() const { return destruct_count_; }
 
  protected:
-  void Free(Allocation *allocation) override {
+  void FreeImpl(Allocation *allocation) override {
     auto *alloc = dynamic_cast<StubAllocation *>(allocation);
     PADDLE_ENFORCE_NOT_NULL(alloc);
     if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr());
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc
index cc81a6f7b8b1950b07b6fb1571b53d9b5ddb1b9f..0fb2e6e1496fcfed74121f0fae5ab87744f75aa2 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.cc
+++ b/paddle/fluid/memory/allocation/cpu_allocator.cc
@@ -25,7 +25,7 @@ CPUAllocation::CPUAllocation(void *ptr, size_t size)
 
 bool CPUAllocator::IsAllocThreadSafe() const { return true; }
 
-void CPUAllocator::Free(Allocation *allocation) {
+void CPUAllocator::FreeImpl(Allocation *allocation) {
   PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation *>(allocation));
   free(allocation->ptr());
   delete allocation;
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index 26d3643f4edff1f2d71b1c761e915a6dacb485ad..9e0c2551860954f4681be6e1a223aa61ae3a4df0 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -43,7 +43,7 @@ class CPUAllocator : public Allocator {
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void Free(Allocation* allocation) override;
+  void FreeImpl(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 };
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 430bf0be98e08787ac4412a8b6e0fcc310ffe2b4..2e7c4ee78f4dcc51391233cca8ef896784550da1 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -23,13 +23,14 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
-void CUDAAllocator::Free(Allocation* allocation) {
+void CUDAAllocator::FreeImpl(Allocation* allocation) {
   platform::CUDADeviceGuard guard(place_.device);
   auto* cuda_allocation = dynamic_cast<CUDAAllocation*>(allocation);
   PADDLE_ENFORCE_NOT_NULL(cuda_allocation);
   PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(cuda_allocation->place()),
                     place_);
   PADDLE_ENFORCE(cudaFree(allocation->ptr()));
+  VLOG(2) << "cudaFree is called";
   delete allocation;
 }
 Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h
index 63726f5820b1c81565117c7a9bf798c17c9681f6..962f9a7c028a77ccb451fa06d295110c46a8cdc4 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_allocator.h
@@ -35,7 +35,7 @@ class CUDAAllocator : public Allocator {
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void Free(Allocation* allocation) override;
+  void FreeImpl(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 1936f9d4cd83c53cf7b322ab29a3e0d92e042abc..9c71c0bbcef3762591516691c18f231c89938453 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -336,7 +336,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   return new Allocation(ptr, size, place_);
 }
 
-void LegacyAllocator::Free(Allocation *allocation) {
+void LegacyAllocator::FreeImpl(Allocation *allocation) {
   boost::apply_visitor(
       legacy::FreeVisitor(allocation->ptr(), allocation->size()),
       allocation->place());
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h
index d9bdae153da6439598f76f5cac226897e6e0c596..27cd42ea35012f07ae7db79c46d767138ddaafff 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
@@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator {
 
  protected:
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
-  void Free(Allocation *allocation) override;
+  void FreeImpl(Allocation *allocation) override;
 
  private:
   platform::Place place_;
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
index 835f6527c8a1d83340167bd9079f7cee25ad24cf..03a17814e1a6f5285cec7f12e6840bc4f08d9a27 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
@@ -23,26 +23,24 @@ namespace allocation {
 bool LockedAllocator::IsAllocThreadSafe() const { return true; }
 
 LockedAllocator::LockedAllocator(
-    std::unique_ptr<Allocator> &&underlying_allocator)
+    std::shared_ptr<Allocator> underlying_allocator)
     : underlying_allocator_(std::move(underlying_allocator)) {
   PADDLE_ENFORCE_NOT_NULL(underlying_allocator_);
   if (!underlying_allocator_->IsAllocThreadSafe()) {
     mtx_.reset(new std::mutex());
   }
 }
-void LockedAllocator::Free(Allocation *allocation) {
-  {
-    platform::LockGuardPtr<std::mutex> guard(mtx_);
-    reinterpret_cast<AllocationWithUnderlying *>(allocation)
-        ->allocation_.reset();  // Destroy inner allocation
-  }
-  delete allocation;
+
+void LockedAllocator::FreeImpl(Allocation *allocation) {
+  platform::LockGuardPtr<std::mutex> guard(mtx_);
+  underlying_allocator_->Free(allocation);
 }
+
 Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   platform::LockGuardPtr<std::mutex> guard(mtx_);
-  return new AllocationWithUnderlying(
-      underlying_allocator_->Allocate(size, attr));
+  return underlying_allocator_->Allocate(size, attr).release();
 }
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
index 4967b9bb8d3ad101cff4657b0a45b49b76e2deb2..b735ccef101417b3f880eb6dcdd9964cffbe875c 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -24,15 +24,15 @@ namespace allocation {
 // An allocator that makes the underlying allocator thread safe.
 class LockedAllocator : public Allocator {
  public:
-  explicit LockedAllocator(std::unique_ptr<Allocator> &&underlying_allocator);
+  explicit LockedAllocator(std::shared_ptr<Allocator> underlying_allocator);
 
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void Free(Allocation *allocation) override;
+  void FreeImpl(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::unique_ptr<Allocator> underlying_allocator_;
+  std::shared_ptr<Allocator> underlying_allocator_;
   std::unique_ptr<std::mutex> mtx_;
 };
 
diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..44240121f055dbbcb9246b00ee5d5ae38d7c1a55
--- /dev/null
+++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
+#include <algorithm>
+#include <limits>
+#include "paddle/fluid/platform/lock_guard_ptr.h"
+
+DEFINE_double(tolerant_times, 2,
+              "Tolerant size multiple used when matching cached allocations "
+              "in buffered_allocator");
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+static void CheckAndModifyMemoryDivisionPlan(
+    std::vector<size_t> *division_plan) {
+  // Check whether the division plan is strictly sorted
+  bool is_strictly_sorted = true;
+  for (size_t i = 1; i < division_plan->size(); ++i) {
+    if ((*division_plan)[i - 1] >= (*division_plan)[i]) {
+      is_strictly_sorted = false;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(is_strictly_sorted, "Division plan must be strictly sorted");
+
+  // Insert 0 into and remove MAX from the division plan to simplify the
+  // binary search code below
+  if (division_plan->empty() || division_plan->front() != 0) {
+    division_plan->insert(division_plan->begin(), 0);
+  }
+
+  constexpr auto kSizeTypeMax = std::numeric_limits<size_t>::max();
+  if (division_plan->back() == kSizeTypeMax) {
+    division_plan->pop_back();
+  }
+
+  PADDLE_ENFORCE(division_plan->size() >= 1, "Division plan cannot be empty");
+}
+
+static std::vector<size_t> GetDefaultDivisionPlan() {
+  std::vector<size_t> plan;
+  for (size_t i = 0; i < sizeof(size_t) * 8; ++i) {
+    plan.push_back(static_cast<size_t>(1) << i);
+  }
+  return plan;
+}
+
+inline static size_t FindDivisionPlanBinIndex(const std::vector<size_t> &bins,
+                                              size_t size) {
+  return static_cast<size_t>(std::upper_bound(bins.begin(), bins.end(), size) -
+                             bins.begin() - 1);
+}
+
+inline static size_t TolerantUpperSize(size_t size) {
+  return static_cast<size_t>(size * FLAGS_tolerant_times);
+}
+
+MultiBinBufferedAllocator::MultiBinBufferedAllocator(
+    std::shared_ptr<Allocator> underlying_allocator)
+    : MultiBinBufferedAllocator(std::move(underlying_allocator),
+                                GetDefaultDivisionPlan()) {}
+
+MultiBinBufferedAllocator::MultiBinBufferedAllocator(
+    std::shared_ptr<Allocator> underlying_allocator,
+    const std::vector<size_t> &division_plan)
+    : underlying_allocator_(std::move(underlying_allocator)),
+      division_plan_(division_plan) {
+  CheckAndModifyMemoryDivisionPlan(&division_plan_);
+  allocations_.resize(division_plan_.size());
+  mtx_.resize(division_plan_.size());
+  if (underlying_allocator_->IsAllocThreadSafe()) {
+    for (auto &mtx : mtx_) {
+      mtx.reset(new std::mutex());
+    }
+  }
+
+  VLOG(1) << "FLAGS_tolerant_times = " << FLAGS_tolerant_times;
+}
+
+void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
+  auto bin_index = FindDivisionPlanBinIndex(division_plan_, allocation->size());
+  {
+    platform::LockGuardPtr<std::mutex> guard(mtx_[bin_index]);
+    allocations_[bin_index].emplace(allocation->size(),
+                                    AllocationPtr(allocation));
+  }
+}
+
+void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
+  size_t accumulated_size = 0;
+  // FIXME(zjl): free the largest first when there is no extra
+  for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
+    platform::LockGuardPtr<std::mutex> lock(mtx_[i]);
+    while (!allocations_[i].empty()) {
+      auto it = --allocations_[i].end();
+      accumulated_size += it->second->size();
+      underlying_allocator_->Free(it->second.release());
+      allocations_[i].erase(it);
+      if (accumulated_size >= size) {
+        return;
+      }
+    }
+  }
+}
+
+Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
+  auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
+  auto upper_size = TolerantUpperSize(size);
+
+  for (; bin_index < division_plan_.size() &&
+         upper_size >= division_plan_[bin_index];
+       ++bin_index) {
+    auto &allocation = allocations_[bin_index];
+    platform::LockGuardPtr<std::mutex> lock(mtx_[bin_index]);
+    auto it = allocation.lower_bound(size);
+    if (it != allocation.end() && it->second->size() < upper_size) {
+      auto ret = std::move(it->second);
+      allocation.erase(it);
+      return ret.release();
+    }
+  }
+
+  try {
+    return underlying_allocator_->Allocate(size, attr).release();
+  } catch (BadAlloc &) {
+    VLOG(2) << "BadAlloc raised, try to free " << size << " bytes of cache";
+    FreeCache(size, bin_index);
+    return underlying_allocator_->Allocate(size, attr).release();
+  }
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2437ff7e35832e80eaf15b5a4a6cd2c83cfe537
--- /dev/null
+++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <mutex>
+#include <vector>
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class MultiBinBufferedAllocator : public Allocator {
+ public:
+  explicit MultiBinBufferedAllocator(
+      std::shared_ptr<Allocator> underlying_allocator);
+
+  MultiBinBufferedAllocator(std::shared_ptr<Allocator> underlying_allocator,
+                            const std::vector<size_t>& division_plan);
+
+  bool IsAllocThreadSafe() const override { return mtx_.front() != nullptr; }
+
+  void ClearCache() { FreeCache(static_cast<size_t>(-1), 0); }
+
+ protected:
+  Allocation* AllocateImpl(size_t size, Attr attr) override;
+  void FreeImpl(Allocation* allocation) override;
+
+ private:
+  void FreeCache(size_t size, size_t bin_index);
+
+  std::shared_ptr<Allocator> underlying_allocator_;
+  std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
+  std::vector<size_t> division_plan_;
+  std::vector<std::unique_ptr<std::mutex>> mtx_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
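A worked example (editorial sketch) of the bin lookup and tolerant search range used by MultiBinBufferedAllocator, assuming the default power-of-two division plan and FLAGS_tolerant_times = 2:

#include <algorithm>
#include <cstdio>
#include <vector>

// Same lookup as in the patch: index of the last bin boundary <= size.
static size_t FindDivisionPlanBinIndex(const std::vector<size_t>& bins,
                                       size_t size) {
  return static_cast<size_t>(std::upper_bound(bins.begin(), bins.end(), size) -
                             bins.begin() - 1);
}

int main() {
  // Default-style plan, truncated for the demo: {0, 1, 2, 4, ..., 2048}.
  std::vector<size_t> plan{0};
  for (size_t i = 0; i < 12; ++i) plan.push_back(static_cast<size_t>(1) << i);

  // A 900-byte request falls into the bin starting at 512 ...
  size_t bin = FindDivisionPlanBinIndex(plan, 900);
  std::printf("bin start = %zu\n", plan[bin]);  // 512

  // ... and tolerates cached blocks of size [900, 1800), so AllocateImpl's
  // loop visits the 512 bin and then the 1024 bin (1024 <= 1800) before
  // stopping at the 2048 bin (2048 > 1800) and falling through to the
  // underlying allocator.
  std::printf("tolerant upper size = %zu\n", static_cast<size_t>(900 * 2));
  return 0;
}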
diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22787a8512338c9e2045f3e482171656811dd0eb
--- /dev/null
+++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
+#include <gtest/gtest.h>
+#include <vector>
+#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/cpu_allocator.h"
+#include "paddle/fluid/memory/allocation/locked_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+inline std::shared_ptr<MultiBinBufferedAllocator> GetBufferedAllocator(
+    Allocation *allocation, bool thread_safe) {
+  std::shared_ptr<Allocator> allocator(new BestFitAllocator(allocation));
+  if (thread_safe) {
+    allocator.reset(new LockedAllocator(std::move(allocator)));
+  }
+
+  return std::make_shared<MultiBinBufferedAllocator>(allocator);
+}
+
+TEST(buffered_allocator, thread_safety) {
+  std::unique_ptr<CPUAllocator> allocator(new CPUAllocator());
+  auto chunk = allocator->Allocate(1 << 20, allocator->kDefault);
+  {
+    auto buf_allocator = GetBufferedAllocator(chunk.get(), true);
+    ASSERT_EQ(buf_allocator->IsAllocThreadSafe(), true);
+  }
+
+  {
+    auto buf_allocator = GetBufferedAllocator(chunk.get(), false);
+    ASSERT_EQ(buf_allocator->IsAllocThreadSafe(), false);
+  }
+}
+
+class StubAllocation : public Allocation {
+ public:
+  using Allocation::Allocation;
+};
+
+class StubAllocator : public Allocator {
+ public:
+  void ResetCounter() {
+    construct_count_ = 0;
+    destruct_count_ = 0;
+  }
+
+  size_t GetAllocCount() const { return construct_count_; }
+
+  size_t GetFreeCount() const { return destruct_count_; }
+
+ protected:
+  void FreeImpl(Allocation *allocation) override {
+    auto *alloc = dynamic_cast<StubAllocation *>(allocation);
+    PADDLE_ENFORCE_NOT_NULL(alloc);
+    if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr());
+    ++destruct_count_;
+    delete allocation;
+  }
+
+  Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override {
+    ++construct_count_;
+    if (size == 0) {
+      return new StubAllocation(nullptr, 0, platform::CPUPlace());
+    } else {
+      return new StubAllocation(new uint8_t[size], size, platform::CPUPlace());
+    }
+  }
+
+ private:
+  size_t construct_count_ = 0;
+  size_t destruct_count_ = 0;
+};
+
+constexpr size_t kZero = 0;
+constexpr size_t kOne = 1;
+constexpr size_t kTwo = 2;
+
+TEST(buffered_allocator, lazy_free) {
+  std::vector<size_t> original_alloc_size({1022, 1023, 1024, 1025, 1026});
+  for (auto alloc_size : original_alloc_size) {
+    auto stub_allocator = std::make_shared<StubAllocator>();
+    auto *underlying_allocator = stub_allocator.get();
+    auto allocator =
+        std::make_shared<MultiBinBufferedAllocator>(stub_allocator);
+
+    {
+      underlying_allocator->ResetCounter();
+      auto x = allocator->Allocate(alloc_size, allocator->kDefault);
+      ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
+      ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+      x = nullptr;
+      ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+    }
+
+    {
+      underlying_allocator->ResetCounter();
+      auto x = allocator->Allocate(900, allocator->kDefault);
+      ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
+      ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+      auto y = allocator->Allocate(2048, allocator->kDefault);
+      ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
+      ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+      x = nullptr;
+      ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+      y = nullptr;
+      ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+    }
+
+    {
+      underlying_allocator->ResetCounter();
+      allocator->ClearCache();
+      ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
+      ASSERT_EQ(underlying_allocator->GetFreeCount(), kTwo);
+    }
+  }
+}
+
+TEST(buffered_allocator, garbage_collection) {
+  std::unique_ptr<CPUAllocator> cpu_allocator(new CPUAllocator());
+  auto chunk = cpu_allocator->Allocate(2048, cpu_allocator->kDefault);
+  auto allocator = GetBufferedAllocator(chunk.get(), false);
+  auto x1 = allocator->Allocate(1600, allocator->kDefault);
+  auto x2 = allocator->Allocate(400, allocator->kDefault);
+  x1 = nullptr;
+  x2 = nullptr;
+  auto x3 = allocator->Allocate(1600, allocator->kDefault);
+  ASSERT_NE(x3, nullptr);
+  ASSERT_NE(x3->ptr(), nullptr);
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index de81d12cca6ca280289371abdec225c9e2b8f4d0..dfc52edf9c8b3539c565d00e291c121b62c2e22a 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
-void CPUPinnedAllocator::Free(Allocation *allocation) {
+void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
   PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation *>(allocation));
   PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
   delete allocation;
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h
index 42d0938f2afbb1efca8bfdd7035bc0eada30f06b..3acb1f0c5ae35261667296280356efdc4f4b7649 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
@@ -31,7 +31,7 @@ class CPUPinnedAllocator : public Allocator {
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void Free(Allocation *allocation) override;
+  void FreeImpl(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 };
 
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index 981705051b449e6a35c2dcce9138dc2efae52920..7e888988f9602e362d73f64c1b45552e84e3349c 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -18,25 +18,15 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-bool RetryAllocator::IsAllocThreadSafe() const {
-  return underlying_allocator_->IsAllocThreadSafe();
-}
-
-void RetryAllocator::Free(Allocation* allocation) {
+void RetryAllocator::FreeImpl(Allocation* allocation) {
   // Delete underlying allocation first.
-  reinterpret_cast<RetryAllocation*>(allocation)->allocation_.reset();
-  {
-    // notify all waited allocators, they can try to allocate memory after free.
-    std::lock_guard<std::mutex> lock(mutex_);
-    cv_.notify_all();
-  }
-  delete allocation;
+  underlying_allocator_->Free(allocation);
+  cv_.notify_all();
 }
 
 Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   auto alloc_func = [&, this]() {
-    return new AllocationWithUnderlying(
-        underlying_allocator_->Allocate(size, attr));
+    return underlying_allocator_->Allocate(size, attr).release();
   };
   // In fact, we can unify the code of allocation success and failure
   // But it would add lock even when allocation success at the first time
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 5efcac8b108002a2a2da920173d237096de4fffa..70b9c2ba1d6f91525fa87d8f1d442da58b641f2f 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -24,32 +24,25 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-class RetryAllocator;
-
 class RetryAllocator : public Allocator {
  public:
-  RetryAllocator(std::unique_ptr<Allocator>&& allocator, size_t retry_ms)
+  RetryAllocator(std::shared_ptr<Allocator> allocator, size_t retry_ms)
       : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) {
-    EnforceCheck();
-  }
-
-  bool IsAllocThreadSafe() const override;
-
- private:
-  void EnforceCheck() {
     PADDLE_ENFORCE_NOT_NULL(
-        underlying_allocator_.get(),
-        "UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator");
+        underlying_allocator_,
+        "UnderlyingAllocator of RetryAllocator must not be null");
     PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(),
                    "UnderlyingAllocator of RetryAllocator must be thread-safe");
   }
 
+  bool IsAllocThreadSafe() const override { return true; }
+
  protected:
-  void Free(Allocation* allocation) override;
+  void FreeImpl(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::unique_ptr<Allocator> underlying_allocator_;
+  std::shared_ptr<Allocator> underlying_allocator_;
   std::chrono::milliseconds retry_time_;
   std::mutex mutex_;
   std::condition_variable cv_;
@@ -57,8 +50,6 @@ class RetryAllocator : public Allocator {
   // For debug, We can add an atomic integer to record how many memory sizes are
   // waited to allocate
   // std::atomic<size_t> waited_allocate_size_{0};
-
-  friend class RetryAllocation;
 };
 
 }  // namespace allocation
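RetryAllocator::FreeImpl now frees through the underlying allocator first and then signals cv_, so an Allocate blocked in a retry wait can succeed. The diff does not show AllocateImpl's wait loop; the following self-contained model (all names are simplified assumptions, not Paddle's actual implementation) sketches the free-then-notify protocol:

#include <chrono>
#include <condition_variable>
#include <mutex>

class RetryModel {
 public:
  bool AllocateWithRetry(std::chrono::milliseconds retry_time) {
    if (TryAllocate()) return true;
    std::unique_lock<std::mutex> lock(mutex_);
    // Block until some Free() calls cv_.notify_all() (or the retry time
    // elapses), then attempt the allocation once more.
    cv_.wait_for(lock, retry_time);
    return TryAllocate();
  }

  void Free() {
    // Release the memory first, so a retried allocation can actually succeed,
    // then wake every waiter.
    cv_.notify_all();
  }

 private:
  bool TryAllocate() { return false; /* placeholder for the real attempt */ }
  std::mutex mutex_;
  std::condition_variable cv_;
};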
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc
index cb2df1a029815478bbc9d3b09425f3ef145c5fb3..a0211b6d83281316f8f669a6039cb3ed3ed1e464 100644
--- a/paddle/fluid/memory/allocation/zero_size_allocator.cc
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.cc
@@ -22,6 +22,14 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const {
   return underlying_allocator_->IsAllocThreadSafe();
 }
 
+void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
+  if (dynamic_cast<ZeroSizeAllocation *>(allocation)) {
+    delete allocation;
+  } else {
+    underlying_allocator_->Free(allocation);
+  }
+}
+
 Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   if (size == 0) {
     return new ZeroSizeAllocation(place_);
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h
index 6b80245a34e7a6834aa75a90218845cc92036881..e6081798364272e41b13a9914bc92160dbda70bf 100644
--- a/paddle/fluid/memory/allocation/zero_size_allocator.h
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.h
@@ -39,6 +39,7 @@ class ZeroSizeAllocator : public Allocator {
 
  protected:
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
+  void FreeImpl(Allocation* allocation) override;
 
  private:
   std::shared_ptr<Allocator> underlying_allocator_;
diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc
index 9cbdfe46e78dc84e58eae6929c887221d9562c69..6cb4ec1da5e324aece08234ea56704bb5df001ff 100644
--- a/paddle/fluid/platform/temporary_allocator.cc
+++ b/paddle/fluid/platform/temporary_allocator.cc
@@ -29,38 +29,31 @@ namespace paddle {
 namespace platform {
 namespace alloc = memory::allocation;
 
-TemporaryAllocation::TemporaryAllocation(
-    alloc::AllocationPtr &&underlying_allocation)
-    : Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
-                 underlying_allocation->place()),
-      underlying_allocation_(std::move(underlying_allocation)) {}
-
 TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) {
-  temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
+  temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>());
 }
 
 bool TemporaryAllocator::IsAllocThreadSafe() const { return true; }
 
 void TemporaryAllocator::Release(const std::function<void()> &callback) {
-  std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> t_allocations;
+  std::unique_ptr<std::multimap<size_t, alloc::Allocation *>> t_allocations;
   {
     std::unique_lock<std::mutex> lock(mtx_);
     callback();
     t_allocations.swap(temp_mem_map_);
-    temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
+    temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>());
     wait_delete_mem_ = 0;
   }
+
+  alloc::AllocationDeleter deleter;
   for (auto tmp : *t_allocations) {
     VLOG(10) << "Delete temporary allocation " << tmp.second->ptr()
              << " size: " << tmp.second->size();
-    delete tmp.second;
+    deleter(tmp.second);
   }
 }
 
-void TemporaryAllocator::Free(alloc::Allocation *allocation) {
-  auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation);
-  PADDLE_ENFORCE_NOT_NULL(temp_allocation);
+void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
   if (platform::is_gpu_place(temp_allocation->place())) {
     PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_),
                    "The place should be the same.");
@@ -84,7 +77,6 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
   }
   VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
            << " size: " << temp_allocation->size();
-  delete temp_allocation;
 }
 
 size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
@@ -119,11 +111,9 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl(
   }
   // If no available allocation is found, get one from
   // AllocatorFacadeInstance.
-  auto raw_allocation =
-      alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
-  auto temp_mem = new TemporaryAllocation(std::move(raw_allocation));
+  auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
   VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size;
-  return temp_mem;
+  return temp_mem.release();
 }
 
 }  // namespace platform
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h
index d657a14223326aa1e2cb5b154a10a56ae742f95c..cead316ed94135f66eeb81d3bf911986660d3f43 100644
--- a/paddle/fluid/platform/temporary_allocator.h
+++ b/paddle/fluid/platform/temporary_allocator.h
@@ -22,14 +22,6 @@
 namespace paddle {
 namespace platform {
 
-class TemporaryAllocation : public memory::allocation::Allocation {
- public:
-  explicit TemporaryAllocation(
-      memory::allocation::AllocationPtr &&underlying_allocation);
-
-  memory::allocation::AllocationPtr underlying_allocation_;
-};
-
 /*! \brief the TemporaryAllocator is used to alloc the temporary allocation
  * which is used by CUDA's async operation.
  *
@@ -56,7 +48,7 @@ class TemporaryAllocator : public memory::allocation::Allocator {
   void SetCallback(const std::function<void()> &callback);
 
  protected:
-  void Free(memory::allocation::Allocation *allocation) override;
+  void FreeImpl(memory::allocation::Allocation *allocation) override;
   memory::allocation::Allocation *AllocateImpl(
       size_t size, memory::allocation::Allocator::Attr attr) override;
 
@@ -65,8 +57,8 @@ class TemporaryAllocator : public memory::allocation::Allocator {
   platform::Place place_;
   // When the allocation is not held by any variable, it should be placed
   // to temp_mem_map immediately.
-  std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{
-      nullptr};
+  std::unique_ptr<std::multimap<size_t, memory::allocation::Allocation *>>
+      temp_mem_map_{nullptr};
   std::mutex mtx_;
   size_t wait_delete_mem_{0};
   std::function<void()> callback_;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index cf59ff6d3b97a4be5d87f1185acc6173b5d501b2..5820d87ce2f3a4a6dbc3a49bf499fb3735920bed 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -308,6 +308,7 @@ PYBIND11_MODULE(core, m) {
            [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
              self.mutable_data<float>(place);
            })
+      .def("_clear", &Tensor::clear)
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
      .def("set", PyCPUTensorSetFromArray<double>)
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 8102732c55be2e9922875a7f7b29d68aba1f4900..2fa6b79caf129862e501842b0f7b86926423d3a1 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -129,6 +129,7 @@ def __bootstrap__():
         'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
         'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
         'fast_eager_deletion_mode', 'allocator_strategy',
+        'enable_buffered_allocator', 'tolerant_times',
         'reader_queue_speed_test_mode', 'print_sub_graph_dir',
         'pe_profile_fname', 'warpctc_dir', 'inner_op_parallelism',
         'enable_parallel_graph', 'multiple_of_cupti_buffer_size',
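Both new flags are plain gflags, so they can be toggled on the command line or from C++ before the first allocation happens. A hedged sketch (assumes only standard gflags; the flag names come from this patch, the helper function is illustrative):

#include <gflags/gflags.h>

DECLARE_bool(enable_buffered_allocator);
DECLARE_double(tolerant_times);

// Equivalent to passing --enable_buffered_allocator=true --tolerant_times=2.
void EnableMultiBinBufferedAllocator() {
  FLAGS_enable_buffered_allocator = true;  // wrap each chunk allocator
  FLAGS_tolerant_times = 2;  // reuse cached blocks of size [size, 2 * size)
}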