From 953214ad9759e10e066fceb71512e44983023fab Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 19 Mar 2019 12:32:02 +0000 Subject: [PATCH] add more unittest modify allocator strategy remove changes of legacy buddy_allocator test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 + .../{inlined_stack.h => inlined_vector.h} | 29 ++-- paddle/fluid/framework/inlined_vector_test.cc | 53 ++++++ paddle/fluid/memory/allocation/CMakeLists.txt | 37 ++--- .../memory/allocation/aligned_allocator.h | 1 + paddle/fluid/memory/allocation/allocator.cc | 8 +- paddle/fluid/memory/allocation/allocator.h | 26 ++- .../memory/allocation/allocator_facade.cc | 55 +++++-- .../memory/allocation/allocator_strategy.cc | 6 +- ...r.cc => auto_growth_best_fit_allocator.cc} | 10 +- ...tor.h => auto_growth_best_fit_allocator.h} | 4 +- ...o_growth_best_fit_allocator_facade_test.cc | 96 +++++++++++ ...=> auto_growth_best_fit_allocator_test.cc} | 10 +- .../allocation/buffered_allocator_test.cc | 1 + .../memory/allocation/legacy_allocator.cc | 38 ++--- .../memory/allocation/locked_allocator.cc | 2 + .../multi_bin_buffered_allocator.cc | 154 +++++++++++------- .../allocation/multi_bin_buffered_allocator.h | 14 +- .../multi_bin_buffered_allocator_test.cc | 24 ++- .../naive_best_fit_allocator_facade_test.cc | 94 +++++++++++ .../fluid/memory/allocation/retry_allocator.h | 1 + ...ti_bin_buffered_allocator_division_plan.cc | 56 +++++++ .../memory/allocation/zero_size_allocator.cc | 17 +- .../memory/allocation/zero_size_allocator.h | 7 +- paddle/fluid/memory/detail/buddy_allocator.cc | 75 ++++----- paddle/fluid/memory/detail/buddy_allocator.h | 11 +- paddle/fluid/memory/detail/memory_block.h | 9 +- paddle/fluid/platform/gpu_info.cc | 59 +------ paddle/fluid/platform/gpu_info.h | 6 - paddle/fluid/platform/temporary_allocator.cc | 1 + paddle/fluid/platform/temporary_allocator.h | 1 + paddle/fluid/pybind/pybind.cc | 4 + paddle/fluid/string/printf.h | 6 +- python/paddle/fluid/__init__.py | 4 +- 34 files changed, 615 insertions(+), 306 deletions(-) rename paddle/fluid/framework/{inlined_stack.h => inlined_vector.h} (71%) create mode 100644 paddle/fluid/framework/inlined_vector_test.cc rename paddle/fluid/memory/allocation/{auto_increment_best_fit_allocator.cc => auto_growth_best_fit_allocator.cc} (92%) rename paddle/fluid/memory/allocation/{auto_increment_best_fit_allocator.h => auto_growth_best_fit_allocator.h} (96%) create mode 100644 paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc rename paddle/fluid/memory/allocation/{auto_increment_best_fit_allocator_test.cc => auto_growth_best_fit_allocator_test.cc} (85%) create mode 100644 paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc create mode 100644 paddle/fluid/memory/allocation/test_multi_bin_buffered_allocator_division_plan.cc diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ad19d729eb..265a5c6fe2 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -202,6 +202,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) +cc_test(inlined_vector_test SRCS inlined_vector_test.cc) + if (NOT WIN32) cc_test(rw_lock_test SRCS rw_lock_test.cc) endif (NOT WIN32) diff --git a/paddle/fluid/framework/inlined_stack.h b/paddle/fluid/framework/inlined_vector.h similarity index 71% rename from paddle/fluid/framework/inlined_stack.h rename to paddle/fluid/framework/inlined_vector.h index 1083c9f77c..0adff9d212 100644 --- 
a/paddle/fluid/framework/inlined_stack.h
+++ b/paddle/fluid/framework/inlined_vector.h
@@ -14,18 +14,18 @@
 #pragma once
-#include <deque>
+#include <vector>
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
 namespace framework {
 template <typename T, size_t N>
-class InlinedStack {
+class InlinedVector {
   static_assert(N > 0, "N must be larger than 0");
 public:
-  inline void push(const T& item) {
+  inline void push_back(const T& item) {
     if (size_ < N) {
       head_[size_] = item;
     } else {
@@ -34,21 +34,21 @@ class InlinedStack {
     ++size_;
   }
-  inline void pop() {
-    PADDLE_ENFORCE(!empty(), "Try to pop element from empty stack.");
+  inline void pop_back() {
+    PADDLE_ENFORCE(!empty(), "Try to pop_back an element from an empty vector.");
     if (size_ > N) {
       tail_.pop_back();
     }
     --size_;
   }
-  inline const T& top() const {
-    PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
+  inline const T& back() const {
+    PADDLE_ENFORCE(!empty(), "Try to get the back element of an empty vector.");
     return size_ <= N ? head_[size_ - 1] : tail_.back();
   }
-  inline T& top() {
-    PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
+  inline T& back() {
+    PADDLE_ENFORCE(!empty(), "Try to get the back element of an empty vector.");
     return size_ <= N ? head_[size_ - 1] : tail_.back();
   }
@@ -63,10 +63,19 @@ class InlinedStack {
     return i < N ? head_[i] : tail_[i - N];
   }
+  operator std::vector<T>() const {
+    std::vector<T> ret;
+    ret.reserve(size_);
+    for (size_t i = 0; i < size_; ++i) {
+      ret.emplace_back((*this)[i]);
+    }
+    return ret;
+  }
+
 private:
   T head_[N];
   size_t size_{0};
-  std::deque<T> tail_;
+  std::vector<T> tail_;
 };
 } // namespace framework
diff --git a/paddle/fluid/framework/inlined_vector_test.cc b/paddle/fluid/framework/inlined_vector_test.cc
new file mode 100644
index 0000000000..b2b7a95b5e
--- /dev/null
+++ b/paddle/fluid/framework/inlined_vector_test.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/inlined_vector.h"
+#include <vector>
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace framework {
+
+TEST(inlined_vector, inlined_vector) {
+  size_t max_num = 10;
+
+  InlinedVector<size_t, 5> stack;
+
+  for (size_t i = 0; i < max_num; ++i) {
+    ASSERT_EQ(stack.size(), i);
+    stack.push_back(i);
+    ASSERT_EQ(stack.size(), i + 1);
+  }
+
+  std::vector<size_t> vec = stack;
+
+  ASSERT_EQ(stack.size(), vec.size());
+
+  for (size_t i = 0; i < vec.size(); ++i) {
+    ASSERT_EQ(stack[i], vec[i]);
+  }
+
+  for (size_t i = 0; i < max_num; ++i) {
+    ASSERT_EQ(stack[i], i);
+  }
+
+  for (size_t i = 0; i < max_num; ++i) {
+    ASSERT_EQ(stack.back(), max_num - 1 - i);
+    stack.pop_back();
+    ASSERT_EQ(stack.size(), max_num - 1 - i);
+  }
+}
+
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 26ae89fe28..7552eee77e 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -3,13 +3,18 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
-cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator)
+cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator gflags)
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
+cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 cc_test(multi_bin_buffered_allocator_test SRCS multi_bin_buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator multi_bin_buffered_allocator cpu_allocator)
-cc_library(auto_increment_best_fit_allocator SRCS auto_increment_best_fit_allocator.cc DEPS allocator)
-cc_test(auto_increment_best_fit_allocator_test SRCS auto_increment_best_fit_allocator_test.cc DEPS cpu_allocator auto_increment_best_fit_allocator)
+cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator)
+cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
+
+if (NOT WIN32)
+  cc_test(test_multi_bin_buffered_allocator_division_plan SRCS test_multi_bin_buffered_allocator_division_plan.cc DEPS multi_bin_buffered_allocator)
+endif()
 if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
@@ -42,30 +47,20 @@ else ()
   set(AllocatorFacadeDeps)
 endif()
+list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator multi_bin_buffered_allocator auto_growth_best_fit_allocator legacy_allocator zero_size_allocator)
+
 cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
 cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
-cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
-cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
-cc_library(allocator_facade SRCS allocator_facade.cc DEPS 
-  ${AllocatorFacadeDeps}
-  cpu_allocator
-  locked_allocator
-  best_fit_allocator
-  aligned_allocator
-  auto_increment_allocator
-  zero_size_allocator
-  conditional_allocator
-  retry_allocator
-  buffered_allocator
-  multi_bin_buffered_allocator
-  auto_increment_best_fit_allocator
-  allocator_strategy
-  legacy_allocator
-  )
+cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
+cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
 cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
+
+cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
+
+cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS allocator_facade)
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index 602d85bf9e..b536d4276e 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -14,6 +14,7 @@
 #pragma once
 #include
+#include
 #include "paddle/fluid/memory/allocation/allocator.h"
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc
index 15a7227300..5a5253d911 100644
--- a/paddle/fluid/memory/allocation/allocator.cc
+++ b/paddle/fluid/memory/allocation/allocator.cc
@@ -27,24 +27,24 @@ bool Allocator::IsAllocThreadSafe() const { return false; }
 AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
   auto ptr = AllocateImpl(size, attr);
-  ptr->RegisterAllocatorChain(this);
+  ptr->RegisterDecoratedAllocator(this);
   return AllocationPtr(ptr);
 }
 void Allocator::FreeImpl(Allocation* allocation) {
-  Allocator* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopDecoratedAllocator();
   allocator->Free(allocation);
 }
 void Allocator::Free(Allocation* allocation) {
-  allocation->PopAllocator();
+  allocation->PopDecoratedAllocator();
   FreeImpl(allocation);
 }
 const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
 void AllocationDeleter::operator()(Allocation* allocation) const {
-  Allocator* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopDecoratedAllocator();
   allocator->Free(allocation);
 }
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index fabd1ff57f..3497e46516 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -15,8 +15,9 @@
 #pragma once
 #include
 #include
+#include
 #include
-#include "paddle/fluid/framework/inlined_stack.h"
+#include "paddle/fluid/framework/inlined_vector.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
@@ -78,29 +79,26 @@ class Allocation {
   virtual ~Allocation();
-  // This function should only be used in unittest
-  std::vector<Allocator*> GetAllocatorChain() const {
-    std::vector<Allocator*> allocators;
-    for (size_t i = 0; i < allocator_chain_.size(); ++i) {
-      allocators.push_back(allocator_chain_[i]);
-    }
-    return allocators;
+ private:
+  std::vector<Allocator*> DecoratedAllocators() const {
+    return static_cast<std::vector<Allocator*>>(decorated_allocators_);
   }
- private:
-  inline void RegisterAllocatorChain(Allocator* allocator) {
-
allocator_chain_.push(allocator);
+  inline void RegisterDecoratedAllocator(Allocator* allocator) {
+    decorated_allocators_.push_back(allocator);
   }
-  inline void PopAllocator() { allocator_chain_.pop(); }
+  inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }
-  inline Allocator* TopAllocator() { return allocator_chain_.top(); }
+  inline Allocator* TopDecoratedAllocator() {
+    return decorated_allocators_.back();
   }
 private:
   void* ptr_;
   size_t size_;
   platform::Place place_;
-  framework::InlinedStack allocator_chain_;
+  framework::InlinedVector decorated_allocators_;
   friend class Allocator;
   friend class AllocationDeleter;
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index b35032fb3c..0f7d5926f1 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -17,12 +17,13 @@
 #include
 #include
 #include
+#include
 #include
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
-#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/conditional_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -32,6 +33,7 @@
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
@@ -51,6 +53,21 @@ namespace paddle {
 namespace memory {
 namespace allocation {
+static inline std::shared_ptr<Allocator> WrapRetryAndBufferedAllocator(
+    std::shared_ptr<Allocator> allocator, int64_t retry_time,
+    bool enable_buffered) {
+  if (retry_time > 0) {
+    auto* retry_allocator =
+        new RetryAllocator(std::move(allocator), retry_time);
+    allocator.reset(retry_allocator);
+  }
+
+  if (enable_buffered) {
+    allocator.reset(new MultiBinBufferedAllocator(allocator));
+  }
+  return allocator;
+}
+
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
class CPUManagedAllocator : public Allocator {
 public:
@@ -117,17 +134,10 @@ class ChunkedAllocator : public Allocator {
     std::shared_ptr<Allocator> allocator(new LockedAllocator(
         std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));
-    if (retry_time_ > 0) {
-      auto* retry_allocator =
-          new RetryAllocator(std::move(allocator), retry_time_);
-      allocator.reset(retry_allocator);
-    }
+    allocator = WrapRetryAndBufferedAllocator(allocator, retry_time_,
+                                              FLAGS_enable_buffered_allocator);
-    if (FLAGS_enable_buffered_allocator) {
-      allocator.reset(new MultiBinBufferedAllocator(allocator));
-    }
-
-    return std::make_shared>(std::move(allocator));
+    return std::make_shared>(std::move(allocator));
   }
   bool IsAllocThreadSafe() const override { return true; }
@@ -210,7 +220,7 @@ class AllocatorFacadePrivate {
         break;
       }
       case AllocatorStrategy::kAutoGrowthBestFit: {
-        InitCPUAllocator();
+        InitAutoGrowthCPUAllocator();
         InitAutoGrowthCUDAAllocator();
         InitAutoGrowthCUDAPinnedAllocator();
         WrapZeroSizeAllocator();
@@ -224,15 +234,25 @@
   }
 private:
+  void InitAutoGrowthCPUAllocator() {
+    auto cpu_allocator = std::make_shared<AlignedAllocator<4096>>(
+        std::make_shared<CPUAllocator>());
+    allocators_[platform::CPUPlace()] =
+        std::make_shared<AutoGrowthBestFitAllocator>(
+            cpu_allocator, platform::CpuMaxChunkSize(), 4096);
+  }
+
   void InitAutoGrowthCUDAAllocator() {
 #ifdef PADDLE_WITH_CUDA
     int dev_cnt = platform::GetCUDADeviceCount();
     for (int dev_id = 0; dev_id < dev_cnt; ++dev_id) {
       auto cuda_allocator = std::make_shared<AlignedAllocator<4096>>(
           std::make_shared<CUDAAllocator>(platform::CUDAPlace(dev_id)));
-      allocators_[platform::CUDAPlace(dev_id)] =
-          std::make_shared<AutoIncrementBestFitAllocator>(
-              cuda_allocator, platform::GpuMaxChunkSize(), 4096);
+      auto allocator = std::make_shared<AutoGrowthBestFitAllocator>(
+          cuda_allocator, platform::GpuMaxChunkSize(), 4096);
+
+      allocators_[platform::CUDAPlace(dev_id)] = WrapRetryAndBufferedAllocator(
+          allocator, FLAGS_gpu_allocator_retry_time, false);
     }
 #endif
   }
@@ -242,7 +262,7 @@ class AllocatorFacadePrivate {
     auto cuda_pinned_allocator = std::make_shared<AlignedAllocator<4096>>(
         std::make_shared<CPUPinnedAllocator>());
     allocators_[platform::CUDAPinnedPlace()] =
-        std::make_shared<AutoIncrementBestFitAllocator>(
+        std::make_shared<AutoGrowthBestFitAllocator>(
             cuda_pinned_allocator, platform::CUDAPinnedMaxChunkSize(), 4096);
 #endif
   }
@@ -300,8 +320,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, Allocator::Attr attr) {
-  return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
-                                     AllocationDeleter());
+  return std::shared_ptr<Allocation>(Alloc(place, size, attr));
 }
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index d96fe0851d..e2a9c8414a 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -19,7 +19,9 @@
 DEFINE_string(
     allocator_strategy, "legacy",
     "The allocation strategy. Legacy means the original allocator of Fluid."
-    "New means the experimental allocators of Fluid. in [legacy, new]");
+    "naive_best_fit means the experimental best fit allocator. "
+    "auto_growth_best_fit means the experimental auto growth best fit "
+    "allocator. 
Enum in [legacy, naive_best_fit, auto_growth_best_fit].");
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -28,7 +30,7 @@
 static AllocatorStrategy GetStrategyFromFlag() {
   if (FLAGS_allocator_strategy == "legacy") {
     return AllocatorStrategy::kLegacy;
-  } else if (FLAGS_allocator_strategy == "navie_best_fit") {
+  } else if (FLAGS_allocator_strategy == "naive_best_fit") {
     return AllocatorStrategy::kNaiveBestFit;
   } else if (FLAGS_allocator_strategy == "auto_growth_best_fit") {
     return AllocatorStrategy::kAutoGrowthBestFit;
diff --git a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
similarity index 92%
rename from paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc
rename to paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index ee52b10aa6..3d901e04d0 100644
--- a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include
 #include
 #include
@@ -29,16 +29,14 @@
 static size_t align(size_t size, size_t alignment) {
   return remaining == 0 ? size : size + alignment - remaining;
 }
-AutoIncrementBestFitAllocator::AutoIncrementBestFitAllocator(
+AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
     const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
     size_t alignment)
     : underlying_allocator_(underlying_allocator),
       chunk_size_(align(chunk_size, alignment)),
       alignment_(alignment) {}
-Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
-                                                        Attr attr) {
-  if (size == 0) return nullptr;
+Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size, Attr attr) {
   size = align(size, alignment_);
   std::lock_guard<std::mutex> guard(mtx_);
   auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
@@ -95,7 +93,7 @@ Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
   return new Chunk::BlockAllocation(block_it);
 }
-void AutoIncrementBestFitAllocator::FreeImpl(Allocation *allocation) {
+void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
   auto &block_it = static_cast<Chunk::BlockAllocation *>(allocation)->block_it_;
   auto &blocks = block_it->chunk_->blocks_;
diff --git a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
similarity index 96%
rename from paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h
rename to paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
index 6e569c2627..f60dad8112 100644
--- a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
@@ -25,9 +25,9 @@
 namespace paddle {
 namespace memory {
 namespace allocation {
-class AutoIncrementBestFitAllocator : public Allocator {
+class AutoGrowthBestFitAllocator : public Allocator {
 public:
-  explicit AutoIncrementBestFitAllocator(
+  explicit AutoGrowthBestFitAllocator(
      const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
      size_t alignment);
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc
new file mode 100644 
index 0000000000..8b8fb5d938 --- /dev/null +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/memory/allocation/allocator_facade.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +DECLARE_int64(gpu_allocator_retry_time); +#endif + +DECLARE_string(allocator_strategy); + +namespace paddle { +namespace memory { +namespace allocation { + +static inline size_t AlignTo(size_t size, size_t alignment = 4096) { + auto remaining = size % alignment; + return remaining == 0 ? size : size + alignment - remaining; +} + +TEST(allocator, allocator) { +#ifdef PADDLE_WITH_CUDA + FLAGS_fraction_of_gpu_memory_to_use = 0.01; + FLAGS_gpu_allocator_retry_time = 500; + FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; +#endif + + FLAGS_allocator_strategy = "auto_growth_best_fit"; + + auto &instance = AllocatorFacade::Instance(); + platform::Place place; + size_t size = 1024; + + { + place = platform::CPUPlace(); + size = 1024; + auto cpu_allocation = instance.Alloc(place, size); + ASSERT_NE(cpu_allocation, nullptr); + ASSERT_NE(cpu_allocation->ptr(), nullptr); + ASSERT_EQ(cpu_allocation->place(), place); + ASSERT_EQ(cpu_allocation->size(), AlignTo(size)); + } + +#ifdef PADDLE_WITH_CUDA + { + place = platform::CUDAPlace(0); + size = 1024; + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), AlignTo(size)); + } + + { + // Allocate 2GB gpu memory + place = platform::CUDAPlace(0); + size = 2 * static_cast(1 << 30); + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), AlignTo(size)); + } + + { + place = platform::CUDAPinnedPlace(); + size = (1 << 20); + auto cuda_pinned_allocation = + instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); + ASSERT_NE(cuda_pinned_allocation, nullptr); + ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); + ASSERT_EQ(cuda_pinned_allocation->place(), place); + ASSERT_GE(cuda_pinned_allocation->size(), AlignTo(size)); + } +#endif +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc similarity index 85% rename from paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc rename to paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index c5fb209279..087eb8c9cc 100644 --- a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc +++ 
b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -22,18 +22,18 @@
 #include
-#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
-TEST(allocator, auto_increment_best_fit_allocator) {
+TEST(allocator, auto_growth_best_fit_allocator) {
   auto cpu_allocator = std::make_shared<CPUAllocator>();
   auto allocator =
-      std::make_shared<AutoIncrementBestFitAllocator>(cpu_allocator, 0, 4096);
+      std::make_shared<AutoGrowthBestFitAllocator>(cpu_allocator, 0, 4096);
   std::mutex mtx;
   std::condition_variable cv;
@@ -60,13 +60,9 @@
   }
   cv.notify_all();
-  thread_main();
-
   for (auto &th : ths) {
     th.join();
   }
-
-  std::cout << "test ends" << std::endl;
 }
 } // namespace allocation
diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
index 7b2138cf34..854a117b0e 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/memory/allocation/buffered_allocator.h"
 #include
+#include
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 1c42994bec..0fd68b2a22 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -37,8 +37,6 @@
 DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_double(initial_gpu_memory_in_mb);
-DECLARE_double(reallocate_gpu_memory_in_mb);
 DECLARE_bool(benchmark);
 namespace paddle {
@@ -72,8 +70,7 @@ BuddyAllocator *GetCPUBuddyAllocator() {
   std::call_once(init_flag, []() {
     a = new detail::BuddyAllocator(
         std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(),
-        platform::CpuMaxChunkSize());
+        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
   });
   return a;
@@ -147,28 +144,16 @@ class GPUBuddyAllocatorList {
     PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
     std::call_once(flags_[dev_id], [this, dev_id] {
       platform::SetDeviceId(dev_id);
-      size_t first_size = platform::GpuFirstAllocateChunkSize();
-      size_t re_size = platform::GpuReAllocateChunkSize();
-      allocators_[dev_id] =
-          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
-                                 new detail::GPUAllocator(dev_id)),
-                             platform::GpuMinChunkSize(), first_size, re_size);
-      VLOG(2) << "\n\nNOTE: each GPU device use "
-              << string::HumanReadableSize(first_size) << "(initial chunk) "
-              << string::HumanReadableSize(re_size) << "(reallocate chunk) "
-              << "% of GPU memory.\n"
-              << "You can set GFlags environment variable '"
-              << "FLAGS_fraction_of_gpu_memory_to_use"
-              << "' or "
                 "'FLAGS_initial_gpu_memory_in_mb/"
                 "FLAGS_reallocate_gpu_memory_in_mb' to change the fraction "
                 "of GPU usage.\n\n";
-      VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use="
-              << FLAGS_fraction_of_gpu_memory_to_use << ", "
-              << "FLAGS_initial_gpu_memory_in_mb="
-              << FLAGS_initial_gpu_memory_in_mb << ", "
-              << "FLAGS_reallocate_gpu_memory_in_mb="
-              << 
FLAGS_reallocate_gpu_memory_in_mb;
+      allocators_[dev_id] = new BuddyAllocator(
+          std::unique_ptr<detail::SystemAllocator>(
+              new detail::GPUAllocator(dev_id)),
+          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+      VLOG(10) << "\n\nNOTE: each GPU device uses "
+               << FLAGS_fraction_of_gpu_memory_to_use * 100
+               << "% of GPU memory.\n"
+               << "You can set GFlags environment variable '"
+               << "FLAGS_fraction_of_gpu_memory_to_use"
+               << "' to change the fraction of GPU usage.\n\n";
     });
     return allocators_[dev_id];
   }
@@ -251,7 +236,6 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::CUDAPinnedAllocator),
                             platform::CUDAPinnedMinChunkSize(),
-                            platform::CUDAPinnedMaxChunkSize(),
                             platform::CUDAPinnedMaxChunkSize());
   });
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
index 03a17814e1..c43099cc88 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
@@ -14,8 +14,10 @@
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include // NOLINT
+#include
 #include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
+
 namespace paddle {
 namespace memory {
 namespace allocation {
diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc
index 3acb17e4a0..c649a7161e 100644
--- a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc
@@ -17,20 +17,37 @@
 #include
 #include
 #include
+#include // NOLINT
 #include
 #include
+#include
 #include "paddle/fluid/platform/lock_guard_ptr.h"
-DEFINE_double(buffered_allocator_excess_times, 2,
-              "Tolerant memory size times of buffered_allocator");
+DEFINE_double(
+    buffered_allocator_excess_times, 2,
+    "Excess memory size times of buffered_allocator. BufferedAllocator"
+    " would try to reuse memory freed previously, but the size of a freed"
+    " allocation may not be exactly the same as the requested size. Here, we"
+    " use a flag to control the excess times of reused memory size. "
+    "Not quite sure what the best excess times value is.");
-DEFINE_string(division_plan_path, "", "Division plan file path");
+DEFINE_string(
+    buffered_allocator_division_plan_path, "",
+    "The file path which "
+    "determines the memory size division plan of BufferedAllocator. "
+    "If it is empty, the default division plan is used. The file must be a "
+    "text file in which each line indicates a bound of the division plan. "
+    "For example, if the text file has 3 lines, which are '500M', '1G', "
+    "'2G', the division plan would be [0, 500M), [500M, 1G), [1G, 2G) "
+    "and [2G, +inf). 
An allocation request whose requested memory size "
+    "falls inside the last interval of the division plan is dispatched to "
+    "the underlying_allocator directly and is not cached when freed.");
 namespace paddle {
 namespace memory {
 namespace allocation {
-std::string TrimStringAndToLowerCase(const std::string &str) {
+static std::string TrimStringAndToUpperCase(const std::string &str) {
   auto not_space = [](char ch) { return std::isspace(ch) == 0; };
   auto first_idx = static_cast<size_t>(
       std::find_if(str.begin(), str.end(), not_space) - str.begin());
@@ -38,41 +55,69 @@
       std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin());
   if (first_idx == str.size() || last_idx == str.size()) return "";
-  last_idx = str.size() - 1 - last_idx;
+  last_idx = str.size() - last_idx;
   auto ret = str.substr(first_idx, last_idx - first_idx);
   std::for_each(ret.begin(), ret.end(),
-                [](char &ch) { ch = std::tolower(ch); });
+                [](char &ch) { ch = std::toupper(ch); });
   return ret;
 }
-static size_t ParseStringToBytes(const std::string &str) {
-  std::string ret = str;
-  if (ret.back() == 'b') {
-    ret.pop_back();
+namespace {
+
+enum DivisionPlanFileStatus { kEOF, kException, kNormal };
+
+} // NOLINT
+
+static size_t ParseStringToBytes(const std::string &original_str,
+                                 DivisionPlanFileStatus *ret_code) {
+  std::string str = TrimStringAndToUpperCase(original_str);
+
+  if (str.empty()) {
+    *ret_code = kEOF;
+    return 0;
+  }
+
+  if (str.back() == 'B') {
+    str.pop_back();
+    if (str.empty()) {
+      *ret_code = kException;
+      return 0;
+    }
   }
-  PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str);
   size_t multiples = 1;
-  switch (ret.back()) {
-    case 'g':
+  switch (str.back()) {
    case 'G':
      multiples *= (static_cast<size_t>(1) << 30);
      break;
-    case 'm':
+    case 'M':
      multiples *= (static_cast<size_t>(1) << 20);
      break;
-    case 'k':
+    case 'K':
      multiples *= (static_cast<size_t>(1) << 10);
      break;
    default:
      break;
   }
-  if (multiples != 1) ret.pop_back();
-  ret = TrimStringAndToLowerCase(ret);
-  double ret_val = 0.0;
-  std::stringstream ss(ret);
-  PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str);
-  return static_cast<size_t>(ret_val * multiples);
+  if (multiples != 1) {
+    str.pop_back();
+    if (str.empty()) {
+      *ret_code = kException;
+      return 0;
+    }
+  }
+
+  str = TrimStringAndToUpperCase(str);
+  double mem_val = -1.0;
+  std::stringstream ss(str);
+  if (!(ss >> mem_val) || mem_val < 0) {
+    *ret_code = kException;
+    return 0;
+  }
+
+  *ret_code = kNormal;
+  return static_cast<size_t>(mem_val * multiples);
 }
 static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
@@ -84,16 +129,27 @@
   return ret + "]";
 }
-static std::vector<size_t> ReadDivisionPlanFromFile(
+std::vector<size_t> ReadBufferedAllocatorDivisionPlanFromFile(
     const std::string &filepath) {
   std::ifstream is(filepath.c_str());
-  PADDLE_ENFORCE(is.good(), "File not exist");
+  PADDLE_ENFORCE(is.good(), "File %s does not exist", filepath);
   std::string str;
   std::vector<size_t> plan;
+  size_t line_num = 1;
   while (std::getline(is, str).good()) {
-    str = TrimStringAndToLowerCase(str);
-    if (str.empty()) break;
-    plan.push_back(ParseStringToBytes(str));
+    DivisionPlanFileStatus status;
+    size_t ret = ParseStringToBytes(str, &status);
+    if (status == kEOF) {
+      break;
+    }
+    if (status == kException) {
+      PADDLE_THROW(
+          "Invalid format in line %d of file %s: '%s'. 
Only supports B, KB, MB, "
+          "GB.",
+          line_num, filepath, str);
+    }
+    plan.push_back(ret);
+    ++line_num;
   }
   return plan;
 }
@@ -110,11 +166,12 @@
 static void CheckAndModifyMemoryDivisionPlan(
   }
   PADDLE_ENFORCE(is_strictly_sorted, "Division plan must be strictly sorted");
-  // Insert 0 and remove MAX to division plan for clean binary searching code
+  // Insert 0 to division plan for clean binary searching code
   if (division_plan->empty() || division_plan->front() != 0) {
     division_plan->insert(division_plan->begin(), 0);
   }
+  // Remove MAX from division plan for clean binary searching code
   constexpr auto kSizeTypeMax = std::numeric_limits<size_t>::max();
   if (division_plan->back() == kSizeTypeMax) {
     division_plan->pop_back();
   }
@@ -124,21 +181,17 @@
 static std::vector<size_t> GetDefaultDivisionPlan() {
-  if (!FLAGS_division_plan_path.empty()) {
-    return ReadDivisionPlanFromFile(FLAGS_division_plan_path);
+  if (!FLAGS_buffered_allocator_division_plan_path.empty()) {
+    return ReadBufferedAllocatorDivisionPlanFromFile(
+        FLAGS_buffered_allocator_division_plan_path);
   }
+  // Default division plan is 4K, 8K, 16K, ..., 512M, 1G
   constexpr size_t kMaxLogSize = 30;
-
   std::vector<size_t> plan;
   for (size_t i = 12; i <= kMaxLogSize; ++i) {
     plan.push_back(static_cast<size_t>(1) << i);
   }
-  /*
-  for (size_t i = 0; i < sizeof(size_t) * 8; ++i) {
-    plan.push_back(static_cast<size_t>(1) << i);
-  }
-  */
   return plan;
 }
@@ -164,6 +217,7 @@
 MultiBinBufferedAllocator::MultiBinBufferedAllocator(
       division_plan_(division_plan) {
   CheckAndModifyMemoryDivisionPlan(&division_plan_);
   allocations_.resize(division_plan_.size() - 1);
+  accumulated_cache_size_.assign(division_plan_.size() - 1, 0UL);
   mtx_.resize(division_plan_.size() - 1);
   if (underlying_allocator_->IsAllocThreadSafe()) {
     for (auto &mtx : mtx_) {
@@ -182,28 +236,22 @@
 void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
     platform::LockGuardPtr<std::mutex> guard(mtx_[bin_index]);
     allocations_[bin_index].emplace(allocation->size(),
                                     AllocationPtr(allocation));
+    accumulated_cache_size_[bin_index] += allocation->size();
   } else {
     underlying_allocator_->Free(allocation);
   }
 }
-// bin_index is not used currently.
 // Maybe we can design more flexible FreeCache strategy based on bin_index
-size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
+// and required size. 
+size_t MultiBinBufferedAllocator::ClearCache() {
   size_t accumulated_size = 0;
   // FIXME(zjl): free the largest first when there is no extra
   for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
     platform::LockGuardPtr<std::mutex> lock(mtx_[i]);
-    if (allocations_[i].empty()) continue;
-    auto it = --allocations_[i].end();
-    do {
-      accumulated_size += it->second->size();
-      underlying_allocator_->Free(it->second.release());
-      allocations_[i].erase(it--);
-      if (accumulated_size >= size) {
-        return accumulated_size;
-      }
-    } while (!allocations_[i].empty());
+    allocations_[i].clear();
+    accumulated_size += accumulated_cache_size_[i];
+    accumulated_cache_size_[i] = 0;
   }
   return accumulated_size;
 }
@@ -212,10 +260,6 @@
 Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
   auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
   auto upper_size = TolerantUpperSize(size);
-  // if (bin_index >= allocations_.size()) {
-  //   VLOG(2) << "Allocate " << size << " from underlying directly";
-  //}
-
   for (; bin_index < allocations_.size() &&
          upper_size >= division_plan_[bin_index];
       ++bin_index) {
@@ -226,6 +270,7 @@
       size_t sz = it->second->size();
       auto ret = std::move(it->second);
       allocation.erase(it);
+      accumulated_cache_size_[bin_index] -= sz;
      VLOG(3) << "Allocate " << sz << "(required " << size
              << ") from cache directly";
      return ret.release();
@@ -239,10 +284,7 @@
      VLOG(2) << "Allocate " << size << " from underlying directly";
      return ret;
    } catch (BadAlloc &) {
-      VLOG(1) << retry_time << "-th BadAlloc raises, try to free " << size
-              << " bytes caches";
-      // size_t actual_free_size = FreeCache(size, bin_index);
-      size_t actual_free_size = FreeCache(-1UL, bin_index);
+      size_t actual_free_size = ClearCache();
      VLOG(1) << retry_time << "-th free " << actual_free_size
              << " bytes caches";
      if (actual_free_size == 0) throw;
    }
  }
 }
+void UseMultiBinBufferedAllocatorGFlags() {}
+
 } // namespace allocation
 } // namespace memory
 } // namespace paddle
diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h
index f550f76e50..b93f4c062b 100644
--- a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h
@@ -16,6 +16,8 @@
 #include
 #include
+#include // NOLINT
+#include
 #include
 #include "paddle/fluid/memory/allocation/allocator.h"
@@ -24,6 +26,9 @@
 namespace paddle {
 namespace memory {
 namespace allocation {
+std::vector<size_t> ReadBufferedAllocatorDivisionPlanFromFile(
+    const std::string& filepath);
+
 class MultiBinBufferedAllocator : public Allocator {
 public:
  explicit MultiBinBufferedAllocator(
@@ -34,21 +39,24 @@
   bool IsAllocThreadSafe() const override { return mtx_.front() != nullptr; }
-  void ClearCache() { FreeCache(static_cast<size_t>(-1), 0); }
+  size_t ClearCache();
+
+  const std::vector<size_t>& DivisionPlan() const { return division_plan_; }
 protected:
   Allocation* AllocateImpl(size_t size, Attr attr) override;
   void FreeImpl(Allocation* allocation) override;
 private:
-  size_t FreeCache(size_t size, size_t bin_index);
-
   std::shared_ptr<Allocator> underlying_allocator_;
   std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
+  std::vector<size_t> accumulated_cache_size_;
   std::vector<size_t> division_plan_;
   std::vector<std::unique_ptr<std::mutex>> mtx_;
 };
+extern void UseMultiBinBufferedAllocatorGFlags();
+
 } // namespace allocation
 } // namespace memory
 } // namespace paddle
diff --git a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc
index 22787a8512..be5dfba644 100644
--- a/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/multi_bin_buffered_allocator_test.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
 #include
+#include
 #include
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -123,10 +124,31 @@
 TEST(buffered_allocator, lazy_free) {
   {
     underlying_allocator->ResetCounter();
-    allocator->ClearCache();
+    size_t cache_size = allocator->ClearCache();
+    ASSERT_EQ(cache_size, static_cast<size_t>(alloc_size + 2048));
     ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
     ASSERT_EQ(underlying_allocator->GetFreeCount(), kTwo);
   }
+
+  {
+    underlying_allocator->ResetCounter();
+    auto p = allocator->Allocate(allocator->DivisionPlan().back(),
+                                 allocator->kDefault);
+    ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
+    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+  }
+
+  ASSERT_EQ(underlying_allocator->GetFreeCount(), kOne);
+
+  {
+    underlying_allocator->ResetCounter();
+    auto p = allocator->Allocate(allocator->DivisionPlan().back() - 1,
+                                 allocator->kDefault);
+    ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
+    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+  }
+
+  ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
 }
 }
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc
new file mode 100644
index 0000000000..6952c19092
--- /dev/null
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+ +#include +#include +#include "paddle/fluid/memory/allocation/allocator_facade.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +DECLARE_int64(gpu_allocator_retry_time); +#endif + +DECLARE_bool(enable_buffered_allocator); + +DECLARE_string(allocator_strategy); + +namespace paddle { +namespace memory { +namespace allocation { + +TEST(allocator, allocator) { +#ifdef PADDLE_WITH_CUDA + FLAGS_fraction_of_gpu_memory_to_use = 0.01; + FLAGS_gpu_allocator_retry_time = 500; + FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; +#endif + + FLAGS_allocator_strategy = "naive_best_fit"; + FLAGS_enable_buffered_allocator = true; + + auto &instance = AllocatorFacade::Instance(); + platform::Place place; + size_t size = 1024; + + { + place = platform::CPUPlace(); + size = 1024; + auto cpu_allocation = instance.Alloc(place, size); + ASSERT_NE(cpu_allocation, nullptr); + ASSERT_NE(cpu_allocation->ptr(), nullptr); + ASSERT_EQ(cpu_allocation->place(), place); + ASSERT_EQ(cpu_allocation->size(), size); + } + +#ifdef PADDLE_WITH_CUDA + { + place = platform::CUDAPlace(0); + size = 1024; + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + // Allocate 2GB gpu memory + place = platform::CUDAPlace(0); + size = 2 * static_cast(1 << 30); + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + place = platform::CUDAPinnedPlace(); + size = (1 << 20); + auto cuda_pinned_allocation = + instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); + ASSERT_NE(cuda_pinned_allocation, nullptr); + ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); + ASSERT_EQ(cuda_pinned_allocation->place(), place); + ASSERT_GE(cuda_pinned_allocation->size(), size); + } +#endif +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 70b9c2ba1d..379f576d6e 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -18,6 +18,7 @@ #include // NOLINT #include #include // NOLINT +#include #include "paddle/fluid/memory/allocation/allocator.h" namespace paddle { diff --git a/paddle/fluid/memory/allocation/test_multi_bin_buffered_allocator_division_plan.cc b/paddle/fluid/memory/allocation/test_multi_bin_buffered_allocator_division_plan.cc new file mode 100644 index 0000000000..15daa8413f --- /dev/null +++ b/paddle/fluid/memory/allocation/test_multi_bin_buffered_allocator_division_plan.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+#include
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
+
+DECLARE_string(buffered_allocator_division_plan_path);
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(buffered_allocator, division_plan) {
+  std::string path = "/tmp/buffered_allocator_division_plan";
+  FLAGS_buffered_allocator_division_plan_path = path;
+
+  {
+    std::vector<std::string> plan(
+        {"100b", "300.7K", "500.3m", "1.02gB", "2g", "4G"});
+
+    std::ofstream os(path);
+    for (auto &p : plan) {
+      os << p << std::endl;
+    }
+    os.close();
+  }
+
+  auto plan = ReadBufferedAllocatorDivisionPlanFromFile(
+      FLAGS_buffered_allocator_division_plan_path);
+  ASSERT_EQ(plan.size(), 6UL);
+  ASSERT_EQ(plan[0], 100UL);
+  ASSERT_EQ(plan[1], static_cast<size_t>(300.7 * 1024));
+  ASSERT_EQ(plan[2], static_cast<size_t>(500.3 * 1024 * 1024));
+  ASSERT_EQ(plan[3], static_cast<size_t>(1.02 * 1024 * 1024 * 1024));
+  ASSERT_EQ(plan[4], static_cast<size_t>(2.0 * 1024 * 1024 * 1024));
+  ASSERT_EQ(plan[5], static_cast<size_t>(4.0 * 1024 * 1024 * 1024));
+}
+
+} // namespace allocation
+} // namespace memory
+} // namespace paddle
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc
index a0211b6d83..39743bcb10 100644
--- a/paddle/fluid/memory/allocation/zero_size_allocator.cc
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.cc
@@ -22,21 +22,22 @@
 bool ZeroSizeAllocator::IsAllocThreadSafe() const {
   return underlying_allocator_->IsAllocThreadSafe();
 }
-void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
-  if (dynamic_cast<ZeroSizeAllocation *>(allocation)) {
-    delete allocation;
+Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
+  if (size == 0) {
+    return new Allocation(nullptr, 0, place_);
   } else {
-    underlying_allocator_->Free(allocation);
+    return underlying_allocator_->Allocate(size, attr).release();
   }
 }
-Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
-  if (size == 0) {
-    return new ZeroSizeAllocation(place_);
+void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
+  if (allocation->size() == 0) {
+    delete allocation;
   } else {
-    return underlying_allocator_->Allocate(size, attr).release();
+    underlying_allocator_->Free(allocation);
   }
 }
+
 } // namespace allocation
 } // namespace memory
 } // namespace paddle
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h
index e608179836..08a7a06dbf 100644
--- a/paddle/fluid/memory/allocation/zero_size_allocator.h
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
+#include
 #include
 #include "paddle/fluid/memory/allocation/allocator.h"
@@ -23,12 +24,6 @@ namespace allocation {
 // The allocator handles the request's size is zero. Allocator will always
 // return an allocation even the request size is zero. 
However, the
 // allocation.ptr() is nullptr
-class ZeroSizeAllocation : public Allocation {
- public:
-  explicit ZeroSizeAllocation(const platform::Place& p)
-      : Allocation(nullptr, 0, p) {}
-};
-
 class ZeroSizeAllocator : public Allocator {
 public:
   ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 80d32ba564..26ef27c3ca 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -25,11 +25,9 @@ namespace detail {
 BuddyAllocator::BuddyAllocator(
     std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
-    size_t first_allocate_chunk_size, size_t reallocate_chunk_size)
+    size_t max_chunk_size)
     : min_chunk_size_(min_chunk_size),
-      first_allocate_chunk_size_(first_allocate_chunk_size),
-      reallocate_chunk_size_(reallocate_chunk_size),
-      max_chunk_size_(first_allocate_chunk_size),
+      max_chunk_size_(max_chunk_size),
       cache_(system_allocator->UseGpu()),
       system_allocator_(std::move(system_allocator)) {}
 BuddyAllocator::~BuddyAllocator() {
              "have actually been freed";
   while (!pool_.empty()) {
     auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
-    auto desc = cache_.load(block);
-    VLOG(10) << "Free from block (" << block << ", " << desc.size << ")";
+    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
-    system_allocator_->Free(block, desc.size, desc.index);
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
     cache_.invalidate(block);
     pool_.erase(pool_.begin());
   }
 }
@@ -66,7 +63,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // if the allocation is huge, send directly to the system allocator
   if (size > max_chunk_size_) {
     VLOG(10) << "Allocate from system allocator.";
-    return SystemAlloc(size, false);
+    return SystemAlloc(size);
   }
   // query and allocate from the existing chunk
@@ -75,9 +72,9 @@
   // refill the pool if failure
   if (it == pool_.end()) {
     it = RefillPool();
-    // if still failure, try to allocate from SystemAllocator
+    // if still failure, fail fatally
     if (it == pool_.end()) {
-      return SystemAlloc(size, false);
+      return nullptr;
     }
   } else {
     VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
@@ -101,7 +98,7 @@ void BuddyAllocator::Free(void* p) {
   VLOG(10) << "Free from address " << block;
-  if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) {
+  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
     VLOG(10) << "Free directly from system allocator";
     system_allocator_->Free(block, block->total_size(cache_),
                             block->index(cache_));
@@ -171,12 +168,9 @@ void BuddyAllocator::Free(void* p) {
 size_t BuddyAllocator::Used() { return total_used_; }
 size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
-size_t BuddyAllocator::GetMaxChunkSize() {
-  std::lock_guard<std::mutex> lock(mutex_);
-  return max_chunk_size_;
-}
+size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
-void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
+void* BuddyAllocator::SystemAlloc(size_t size) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(&index, size);
@@ -184,23 +178,25 @@
   if (p == nullptr) return nullptr;
-  static_cast<MemoryBlock*>(p)->init(
-      &cache_, is_managed ? 
MemoryBlock::MANAGED_HUGE_CHUNK
-                          : MemoryBlock::UNMANAGED_HUGE_CHUNK,
-      index, size, nullptr, nullptr);
+  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
+                                     size, nullptr, nullptr);
   return static_cast<MemoryBlock*>(p)->data();
 }
 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
-  if (total_used_ + total_free_ > 0) {
-    max_chunk_size_ = reallocate_chunk_size_;
+#ifdef PADDLE_WITH_CUDA
+  if (system_allocator_->UseGpu()) {
+    if ((total_used_ + total_free_) == 0) {
+      // Compute the maximum allocation size for the first allocation.
+      max_chunk_size_ = platform::GpuMaxChunkSize();
+    }
   }
+#endif
   // Allocate a new maximum sized block
   size_t index = 0;
-  size_t chunk_size = max_chunk_size_;
-  void* p = system_allocator_->Alloc(&index, chunk_size);
+  void* p = system_allocator_->Alloc(&index, max_chunk_size_);
   if (p == nullptr) return pool_.end();
@@ -208,7 +204,7 @@
           << " from system allocator";
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
-                                     chunk_size, nullptr, nullptr);
+                                     max_chunk_size_, nullptr, nullptr);
   // gpu fallback allocation
   if (system_allocator_->UseGpu() &&
@@ -216,10 +212,10 @@
     fallback_alloc_count_++;
   }
-  total_free_ += chunk_size;
+  total_free_ += max_chunk_size_;
   // dump the block into pool
-  return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first;
+  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
 }
 BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
@@ -275,24 +271,27 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
 void BuddyAllocator::CleanIdleFallBackAlloc() {
   // If fallback allocation does not exist, return directly
-  if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return;
+  if (!fallback_alloc_count_) return;
   for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If the free memory block is smaller than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
     MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
-    auto desc = cache_.load(block);
-    if (desc.index == 0) {
+    // If no GPU fallback allocator, return
+    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
       return;
     }
     VLOG(10) << "Return block " << block << " to fallback allocator.";
-    system_allocator_->Free(block, desc.size, block->index(cache_));
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
     cache_.invalidate(block);
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-    total_free_ -= desc.size;
+    total_free_ -= max_chunk_size_;
     fallback_alloc_count_--;
     // If no fall allocation exists, return directly
@@
 void BuddyAllocator::CleanIdleNormalAlloc() {
   if (!shall_free_alloc()) return;
   for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
-    auto desc = cache_.load(block);
+    // If the free memory block is smaller than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
-    if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) {
-      return;
-    }
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
     VLOG(10) << "Return block " << block << " to base allocator.";
-    system_allocator_->Free(block, desc.size, desc.index);
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
     cache_.invalidate(block);
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-    
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 9553298d5e..400a6d7bfa 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -38,22 +38,6 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "additional trunks of the same size will be requested from gpu "
               "until the gpu has no memory left for another trunk.");
 
-DEFINE_double(
-    initial_gpu_memory_in_mb, -1.0,
-    "GPU memory chunk size in MB."
-    "Allocator would allocate FLAGS_initial_gpu_memory_in_mb size "
-    "chunk first and reallocate FLAGS_reallocate_gpu_memory_in_mb size "
-    "chunk when the first chunk is not enough. This flag has higher priority "
-    "than FLAGS_fraction_of_gpu_memory_to_use. Disable when less than 0.");
-
-DEFINE_double(reallocate_gpu_memory_in_mb, -1.0,
-              "GPU memory chunk size in MB."
-              "If FLAGS_initial_gpu_memory_in_mb is set and "
-              "FLAGS_reallocate_gpu_memory_in_mb "
-              "is less than 0, it would be replaced by "
-              "FLAGS_initial_gpu_memory_in_mb. Disable "
-              "when FLAGS_initial_gpu_memory_in_mb is less than 0.");
-
 DEFINE_bool(
     enable_cublas_tensor_op_math, false,
     "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
@@ -227,54 +211,13 @@ size_t GpuMaxChunkSize() {
 
   size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
                                           (total - reserving));
 
+  PADDLE_ENFORCE_LE(allocating, available,
+                    "Insufficient GPU memory to allocation.");
+
   return allocating;
 }
 
-size_t GpuFirstAllocateChunkSize() {
-  if (FLAGS_initial_gpu_memory_in_mb <= 0) {
-    return GpuMaxChunkSize();
-  }
-
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-           << total / 1024 / 1024 << "M";
-
-  size_t initial_mem =
-      static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb * (1 << 20));
-  PADDLE_ENFORCE_LE(initial_mem, available,
-                    "Insufficient GPU memory to allocation.");
-  return initial_mem;
-}
-
-size_t GpuReAllocateChunkSize() {
-  if (FLAGS_initial_gpu_memory_in_mb <= 0) {
-    return GpuMaxChunkSize();
-  }
-
-  double reallocate_mem = FLAGS_reallocate_gpu_memory_in_mb;
-  if (reallocate_mem < 0) {
-    PADDLE_ENFORCE(FLAGS_initial_gpu_memory_in_mb > 0,
-                   "FLAGS_init_gpu_memory_to_use_mb must be larger than 0");
-    reallocate_mem = FLAGS_initial_gpu_memory_in_mb;
-  }
-
-  size_t total = 0;
-  size_t available = 0;
-  GpuMemoryUsage(&available, &total);
-  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-           << total / 1024 / 1024 << "M";
-  size_t realloc_mem = static_cast<size_t>(reallocate_mem * (1 << 20));
-  PADDLE_ENFORCE_LE(realloc_mem, available,
-                    "Insufficient GPU memory to allocation.");
-  return realloc_mem;
-}
-
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream) {
   PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 7c05658851..1e1ab2503f 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -66,12 +66,6 @@ size_t GpuMinChunkSize();
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
 
-//! Get init chunk size for GPU buddy allocator.
-size_t GpuFirstAllocateChunkSize();
-
-//! Get reallocate chunk size for GPU buddy allocator.
-size_t GpuReAllocateChunkSize();
-
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream);
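With GpuFirstAllocateChunkSize() and GpuReAllocateChunkSize() removed, FLAGS_fraction_of_gpu_memory_to_use is once more the only knob that sizes GPU chunks. The arithmetic inside GpuMaxChunkSize() is easy to check by hand; the following self-contained sketch stubs out the device query (the 16 GB card, the 0.92 fraction, and the 512 MB reservation are assumed example values, not numbers guaranteed by this patch):

#include <cstdio>

int main() {
  const double fraction = 0.92;           // FLAGS_fraction_of_gpu_memory_to_use
  const size_t total = 16ULL << 30;       // total device memory (16 GB)
  const size_t reserving = 512ULL << 20;  // held back for cuDNN workspaces etc.
  const size_t allocating =
      static_cast<size_t>(fraction * static_cast<double>(total - reserving));
  // Prints roughly 14.26 GB: the size of every chunk the buddy allocator
  // will request from the system allocator.
  std::printf("max chunk: %.2f GB\n",
              static_cast<double>(allocating) / static_cast<double>(1ULL << 30));
  return 0;
}

The newly added PADDLE_ENFORCE_LE then fails early when other processes already hold part of the card, instead of letting a later cudaMalloc fail with a less helpful error.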
Disable " - "when FLAGS_initial_gpu_memory_in_mb is less than 0."); - DEFINE_bool( enable_cublas_tensor_op_math, false, "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " @@ -227,54 +211,13 @@ size_t GpuMaxChunkSize() { size_t allocating = static_cast(FLAGS_fraction_of_gpu_memory_to_use * (total - reserving)); + PADDLE_ENFORCE_LE(allocating, available, "Insufficient GPU memory to allocation."); return allocating; } -size_t GpuFirstAllocateChunkSize() { - if (FLAGS_initial_gpu_memory_in_mb <= 0) { - return GpuMaxChunkSize(); - } - - size_t total = 0; - size_t available = 0; - - GpuMemoryUsage(&available, &total); - VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" - << total / 1024 / 1024 << "M"; - - size_t initial_mem = - static_cast(FLAGS_initial_gpu_memory_in_mb * (1 << 20)); - PADDLE_ENFORCE_LE(initial_mem, available, - "Insufficient GPU memory to allocation."); - return initial_mem; -} - -size_t GpuReAllocateChunkSize() { - if (FLAGS_initial_gpu_memory_in_mb <= 0) { - return GpuMaxChunkSize(); - } - - double reallocate_mem = FLAGS_reallocate_gpu_memory_in_mb; - if (reallocate_mem < 0) { - PADDLE_ENFORCE(FLAGS_initial_gpu_memory_in_mb > 0, - "FLAGS_init_gpu_memory_to_use_mb must be larger than 0"); - reallocate_mem = FLAGS_initial_gpu_memory_in_mb; - } - - size_t total = 0; - size_t available = 0; - GpuMemoryUsage(&available, &total); - VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" - << total / 1024 / 1024 << "M"; - size_t realloc_mem = static_cast(reallocate_mem * (1 << 20)); - PADDLE_ENFORCE_LE(realloc_mem, available, - "Insufficient GPU memory to allocation."); - return realloc_mem; -} - void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index 7c05658851..1e1ab2503f 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -66,12 +66,6 @@ size_t GpuMinChunkSize(); //! Get the maximum chunk size for GPU buddy allocator. size_t GpuMaxChunkSize(); -//! Get init chunk size for GPU buddy allocator. -size_t GpuFirstAllocateChunkSize(); - -//! Get reallocate chunk size for GPU buddy allocator. -size_t GpuReAllocateChunkSize(); - //! Copy memory from address src to dst asynchronously. void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index 4e1056cfb9..ddde7baf4c 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/platform/temporary_allocator.h" +#include #include "paddle/fluid/memory/allocation/allocator_facade.h" DEFINE_int64(limit_of_tmp_allocation, -1, diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index cead316ed9..912d45eaf1 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -16,6 +16,7 @@ #include // NOLINT #include #include +#include #include // NOLINT #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/lock_guard_ptr.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 9e6b89f745..6f2e41c159 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -39,6 +39,7 @@ limitations under the License. */ #include "paddle/fluid/imperative/profiler.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h" +#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -133,6 +134,9 @@ PYBIND11_MODULE(core, m) { paddle::platform::CpuTotalPhysicalMemory(); paddle::memory::allocation::UseAllocatorStrategyGFlag(); + + paddle::memory::allocation::UseMultiBinBufferedAllocatorGFlags(); + m.doc() = "C++ core of PaddlePaddle"; // using framework in this function. Since it is inside a function, it will diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 16bb3771f2..66b768665b 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -105,14 +105,12 @@ void Printf(const char* fmt, const Args&... args) { Fprintf(std::cout, fmt, args...); } -template -std::string HumanReadableSize(T size) { +inline std::string HumanReadableSize(double f_size) { size_t i = 0; - double f_size = static_cast(size); double orig = f_size; const std::vector units( {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); - while (f_size > 1024) { + while (f_size >= 1024) { f_size /= 1024; i++; } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 83003fc68b..ad2ce30ab5 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -130,7 +130,8 @@ def __bootstrap__(): 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion', 'allocator_strategy', 'enable_buffered_allocator', - 'buffered_allocator_excess_times', 'reader_queue_speed_test_mode', + 'buffered_allocator_excess_times', + 'buffered_allocator_division_plan_path', 'reader_queue_speed_test_mode', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'inner_op_parallelism', 'enable_parallel_graph', 'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize', @@ -163,7 +164,6 @@ def __bootstrap__(): if core.is_compiled_with_cuda(): read_env_flags += [ - 'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb', 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', -- GitLab