diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 2f69b5c0c8619b09453dbb66cb40f0a18ba204d8..bb4253e0ed2fed17dd8fb0fb874ddd9196a5013b 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -3,6 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
+cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 
 if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc
index 1eb1d3c7e8d402e1d889d7ec0f1ed82acae64758..89ce628c5d51bda7e819a3a8e9ebdb3822a26f53 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator.cc
@@ -34,11 +34,23 @@ BufferedAllocator::BufferedAllocator(std::unique_ptr<Allocator>&& allocator,
   InitAndEnforceCheck(std::move(allocator), division_plan);
 }
 
-BufferedAllocator::~BufferedAllocator() {
+// Hands every allocation still held in the cache back to the underlying
+// allocator. Calls FlushImpl() directly, i.e. without acquiring *mtx_.
+BufferedAllocator::~BufferedAllocator() { FlushImpl(); }
+
+// Returns every cached allocation to the underlying allocator and empties
+// each cache list. Performs no locking itself; Flush() acquires *mtx_ (when
+// one exists) before calling this.
+void BufferedAllocator::FlushImpl() {
   for (auto& v : allocations_) {
     for (auto& pair : v) {
       underlying_allocator_->FreeUniquePtr(std::move(pair.second));
     }
+    // The entries above were moved from; drop them so a later flush (or the
+    // destructor) does not pass them to the underlying allocator again.
+    v.clear();
+  }
+}
+
+// Public entry point that releases all cached allocations. Holds *mtx_ for
+// the whole flush when a mutex exists; otherwise flushes without locking.
+void BufferedAllocator::Flush() {
+  if (mtx_) {
+    std::lock_guard<std::mutex> lock(*mtx_);
+    FlushImpl();
+  } else {
+    FlushImpl();
   }
 }
 
@@ -77,8 +89,7 @@ void BufferedAllocator::InsertAllocationImpl(
     std::unique_ptr<Allocation>&& allocation) {
   auto size = allocation->size();
   auto idx = GetListIndex(size);
-  allocations_[idx].insert(std::pair<size_t, std::unique_ptr<Allocation>>(
-      size, std::move(allocation)));
+  allocations_[idx].emplace(size, std::move(allocation));
 }
 
 void BufferedAllocator::InsertAllocation(
@@ -91,9 +102,8 @@ void BufferedAllocator::InsertAllocation(
   }
 }
 
-bool BufferedAllocator::Match(const std::unique_ptr<Allocation>& allocation,
-                              size_t size) {
-  return (allocation->size() >> 1) <= size;
+// Decides whether a cached allocation of `actual_size` bytes may serve a
+// request for `requested_size` bytes: it is accepted when it is not more than
+// twice the requested size. `actual_size - (actual_size >> 1)` is
+// ceil(actual_size / 2), so the test is exactly
+// `actual_size <= 2 * requested_size` but cannot overflow. A zero-byte block
+// therefore matches a zero-byte request, and a block of exactly twice the
+// requested size is accepted.
+bool BufferedAllocator::Match(size_t actual_size, size_t requested_size) {
+  return actual_size - (actual_size >> 1) <= requested_size;
+}
 
 size_t BufferedAllocator::GetListIndex(size_t size) {
@@ -108,11 +118,28 @@ std::unique_ptr<Allocation> BufferedAllocator::RemoveAllocationImpl(
   auto& allocation_map = allocations_[idx];
   auto it = allocation_map.lower_bound(size);
   // Only remove allocation whose size is not more than twice of requested size
-  if (it != allocation_map.end() && Match(it->second, size)) {
-    auto ret = std::move(it->second);
-    allocation_map.erase(it);
-    return ret;
+  if (it != allocation_map.end()) {
+    if (Match(it->second->size(), size)) {
+      auto ret = std::move(it->second);
+      allocation_map.erase(it);
+      return ret;
+    } else {
+      return nullptr;
+    }
   } else {
+    while (++idx < allocations_.size() && Match(division_plan_[idx], size)) {
+      auto& allocation_map = allocations_[idx];
+      if (!allocation_map.empty()) {
+        auto it = allocation_map.begin();
+        if (Match(it->second->size(), size)) {
+          auto ret = std::move(it->second);
+          allocation_map.erase(it);
+          return ret;
+        } else {
+          return nullptr;
+        }
+      }
+    }
     return nullptr;
   }
 }
@@ -171,6 +198,10 @@ void BufferedAllocator::FreeUniquePtr(std::unique_ptr<Allocation> allocation) {
 
+// Thread safety is reported by the presence of the internal mutex.
 bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; }
 
+// Read-only access to the division plan stored in this allocator. The
+// returned reference refers to a member, so it is valid only while the
+// allocator itself is alive.
+auto BufferedAllocator::GetDivisionPlan() const -> const std::vector<size_t>& {
+  return this->division_plan_;
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h
index 630b3ad800d90711fe4fdee7650efdd828aac45e..0fe6e5a19a84995a9d143f4c3803ff54b77a1f92 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/buffered_allocator.h
@@ -37,12 +37,17 @@ class BufferedAllocator : public UnmanagedAllocator {
 
   ~BufferedAllocator();
 
-  std::unique_ptr<Allocation> Allocate(size_t size, Allocator::Attr) override;
+  // Allocates `size` bytes. `attr` defaults to Allocator::Attr::kDefault so
+  // callers may omit it.
+  std::unique_ptr<Allocation> Allocate(
+      size_t size, Allocator::Attr attr = Allocator::Attr::kDefault) override;
 
   void FreeUniquePtr(std::unique_ptr<Allocation> allocation) override;
 
   bool IsAllocThreadSafe() const override;
 
+  // Returns the division plan held by this allocator.
+  const std::vector<size_t>& GetDivisionPlan() const;
+
+  // Returns all cached allocations to the underlying allocator, locking the
+  // internal mutex first when one exists.
+  void Flush();
+
  private:
   void InitAndEnforceCheck(std::unique_ptr<Allocator>&& allocator,
                            const std::vector<size_t>& division_plan);
@@ -50,13 +55,15 @@ class BufferedAllocator : public UnmanagedAllocator {
   void InsertAllocation(std::unique_ptr<Allocation>&& allocation);
   void InsertAllocationImpl(std::unique_ptr<Allocation>&& allocation);
 
-  static bool Match(const std::unique_ptr<Allocation>& allocation, size_t size);
+  // Tells whether a cached allocation of actual_size bytes is close enough in
+  // size to be handed out for a request of requested_size bytes.
+  static bool Match(size_t actual_size, size_t requested_size);
   std::unique_ptr<Allocation> RemoveAllocation(size_t size);
   std::unique_ptr<Allocation> RemoveAllocationImpl(size_t size);
 
   void FreeAllocations(size_t size);
   void FreeAllocationsImpl(size_t size);
 
+  // Unlocked body of Flush(); also invoked by the destructor.
+  void FlushImpl();
+
   size_t GetListIndex(size_t size);
 
   std::unique_ptr<UnmanagedAllocator> underlying_allocator_;
diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a9fb4f3926c7b9fab50abb9c6b9f25ddd44b093f
--- /dev/null
+++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/buffered_allocator.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/cpu_allocator.h"
+#include "paddle/fluid/memory/allocation/locked_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// Test helper: wraps `allocation` in a BestFitAllocator, optionally guards it
+// with a LockedAllocator, and returns a BufferedAllocator stacked on top.
+// `allocation` is handed over as a raw pointer; the tests in this file keep it
+// alive for as long as the returned allocator exists.
+inline std::unique_ptr<BufferedAllocator> GetBufferedAllocator(
+    Allocation *allocation, bool thread_safe) {
+  std::unique_ptr<Allocator> base(new BestFitAllocator(allocation));
+  if (thread_safe) {
+    // Build the locking wrapper around the current allocator, then make the
+    // wrapper the allocator that gets buffered.
+    std::unique_ptr<Allocator> locked(new LockedAllocator(std::move(base)));
+    base = std::move(locked);
+  }
+
+  std::unique_ptr<BufferedAllocator> buffered(
+      new BufferedAllocator(std::move(base)));
+  return buffered;
+}
+
+// A BufferedAllocator must report itself thread-safe exactly when it was
+// built on top of a locked underlying allocator.
+TEST(buffered_allocator, thread_safety) {
+  std::unique_ptr<CPUAllocator> cpu(new CPUAllocator());
+  auto chunk = cpu->Allocate(1 << 20);
+
+  // Check the locked configuration first, then the unlocked one. Each
+  // BufferedAllocator is destroyed at the end of its iteration, before the
+  // chunk it was carved from is released.
+  for (bool thread_safe : {true, false}) {
+    auto buf_allocator = GetBufferedAllocator(chunk.get(), thread_safe);
+    ASSERT_EQ(buf_allocator->IsAllocThreadSafe(), thread_safe);
+  }
+
+  cpu->FreeUniquePtr(std::move(chunk));
+}
+
+// Allocation subtype handed out by StubAllocator. It adds no state and simply
+// reuses the base-class constructors; its distinct type lets
+// StubAllocator::FreeUniquePtr verify, via dynamic_cast, that it is releasing
+// an allocation it created.
+struct StubAllocation : public Allocation {
+  using Allocation::Allocation;
+};
+
+// Test double for an underlying allocator: memory comes from new[]/delete[],
+// and every Allocate / FreeUniquePtr call is counted so tests can observe
+// when a wrapping allocator actually reaches the underlying one.
+class StubAllocator : public UnmanagedAllocator {
+ public:
+  // Returns a StubAllocation of `size` bytes on CPUPlace (with a null pointer
+  // when `size` is 0) and increments the allocation counter. `attr` is
+  // ignored.
+  std::unique_ptr<Allocation> Allocate(size_t size,
+                                       Allocator::Attr attr) override {
+    ++construct_count_;
+    uint8_t *buffer = (size == 0) ? nullptr : new uint8_t[size];
+    return std::unique_ptr<Allocation>(
+        new StubAllocation(buffer, size, platform::CPUPlace()));
+  }
+
+  // Releases the buffer of an allocation produced by Allocate and increments
+  // the free counter. Enforces that `allocation` is a StubAllocation.
+  // Marked `override` so the compiler verifies it really replaces the base
+  // class hook that BufferedAllocator calls.
+  void FreeUniquePtr(std::unique_ptr<Allocation> allocation) override {
+    StubAllocation *alloc = dynamic_cast<StubAllocation *>(allocation.get());
+    PADDLE_ENFORCE_NOT_NULL(alloc);
+    // delete[] on a null pointer is a no-op, so zero-size allocations need no
+    // special case.
+    delete[] static_cast<uint8_t *>(alloc->ptr());
+    ++destruct_count_;
+  }
+
+  // Sets both call counters back to zero.
+  void ResetCounter() {
+    construct_count_ = 0;
+    destruct_count_ = 0;
+  }
+
+  // Number of Allocate calls since construction or the last ResetCounter.
+  size_t GetAllocCount() const { return construct_count_; }
+
+  // Number of FreeUniquePtr calls since construction or the last ResetCounter.
+  size_t GetFreeCount() const { return destruct_count_; }
+
+ private:
+  size_t construct_count_ = 0;
+  size_t destruct_count_ = 0;
+};
+
+// size_t-typed expected values, matching the type of the StubAllocator
+// counters they are compared against in ASSERT_EQ.
+constexpr size_t kZero{0};
+constexpr size_t kOne{1};
+constexpr size_t kTwo{2};
+
+// Freed allocations must stay cached inside the BufferedAllocator (no call to
+// the underlying allocator) until Flush() is requested.
+TEST(buffered_allocator, lazy_free) {
+  std::unique_ptr<StubAllocator> owned_stub(new StubAllocator());
+  // Keep a raw handle: ownership moves into the BufferedAllocator below, but
+  // the call counters are still read through this pointer.
+  StubAllocator *stub = owned_stub.get();
+  std::unique_ptr<BufferedAllocator> allocator(
+      new BufferedAllocator(std::move(owned_stub)));
+
+  // Phase 1: a first request reaches the stub; freeing it does not.
+  {
+    stub->ResetCounter();
+    auto block = allocator->Allocate(1025);
+    ASSERT_EQ(stub->GetAllocCount(), kOne);
+    ASSERT_EQ(stub->GetFreeCount(), kZero);
+    allocator->FreeUniquePtr(std::move(block));
+    ASSERT_EQ(stub->GetFreeCount(), kZero);
+  }
+
+  // Phase 2: the 900-byte request is served without touching the stub, while
+  // the 2048-byte request needs one new underlying allocation. Returning both
+  // still triggers no underlying free.
+  {
+    stub->ResetCounter();
+    auto reused = allocator->Allocate(900);
+    ASSERT_EQ(stub->GetAllocCount(), kZero);
+    ASSERT_EQ(stub->GetFreeCount(), kZero);
+    auto fresh = allocator->Allocate(2048);
+    ASSERT_EQ(stub->GetAllocCount(), kOne);
+    ASSERT_EQ(stub->GetFreeCount(), kZero);
+    allocator->FreeUniquePtr(std::move(reused));
+    ASSERT_EQ(stub->GetFreeCount(), kZero);
+    allocator->FreeUniquePtr(std::move(fresh));
+    ASSERT_EQ(stub->GetFreeCount(), kZero);
+  }
+
+  // Phase 3: Flush() releases both cached blocks and allocates nothing.
+  {
+    stub->ResetCounter();
+    allocator->Flush();
+    ASSERT_EQ(stub->GetAllocCount(), kZero);
+    ASSERT_EQ(stub->GetFreeCount(), kTwo);
+  }
+}
+
+// After 1600 + 400 bytes carved from a 2048-byte chunk have been returned to
+// the buffer, a new 1600-byte request must still succeed.
+TEST(buffered_allocator, garbage_collection) {
+  std::unique_ptr<CPUAllocator> cpu_allocator(new CPUAllocator());
+  auto chunk = cpu_allocator->Allocate(2048);
+  {
+    // Scoped so the buffered allocator, which works on `chunk`, is destroyed
+    // before the chunk is released below.
+    auto allocator = GetBufferedAllocator(chunk.get(), false);
+    auto x1 = allocator->Allocate(1600);
+    auto x2 = allocator->Allocate(400);
+    allocator->FreeUniquePtr(std::move(x1));
+    allocator->FreeUniquePtr(std::move(x2));
+    auto x3 = allocator->Allocate(1600);
+    ASSERT_NE(x3, nullptr);
+    ASSERT_NE(x3->ptr(), nullptr);
+  }
+
+  // Hand the chunk back explicitly, as the thread_safety test does; letting
+  // the unique_ptr simply go out of scope would not return it to the CPU
+  // allocator.
+  cpu_allocator->FreeUniquePtr(std::move(chunk));
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle