Commit 953214ad authored by sneaxiy

add more unittests

modify allocator strategy
remove changes to legacy buddy_allocator
test=develop
Parent fd23262e
......@@ -202,6 +202,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(inlined_vector_test SRCS inlined_vector_test.cc)
if (NOT WIN32)
cc_test(rw_lock_test SRCS rw_lock_test.cc)
endif (NOT WIN32)
......
......@@ -14,18 +14,18 @@
#pragma once
#include <deque>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
template <typename T, size_t N>
class InlinedStack {
class InlinedVector {
static_assert(N > 0, "N must be larger than 0");
public:
inline void push(const T& item) {
inline void push_back(const T& item) {
if (size_ < N) {
head_[size_] = item;
} else {
......@@ -34,21 +34,21 @@ class InlinedStack {
++size_;
}
inline void pop() {
PADDLE_ENFORCE(!empty(), "Try to pop element from empty stack.");
inline void pop_back() {
PADDLE_ENFORCE(!empty(), "Try to pop back element from empty vector.");
if (size_ > N) {
tail_.pop_back();
}
--size_;
}
inline const T& top() const {
PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
inline const T& back() const {
PADDLE_ENFORCE(!empty(), "Try to get back element of empty vector.");
return size_ <= N ? head_[size_ - 1] : tail_.back();
}
inline T& top() {
PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
inline T& back() {
PADDLE_ENFORCE(!empty(), "Try to get back element of empty vector.");
return size_ <= N ? head_[size_ - 1] : tail_.back();
}
......@@ -63,10 +63,19 @@ class InlinedStack {
return i < N ? head_[i] : tail_[i - N];
}
operator std::vector<T>() const {
std::vector<T> ret;
ret.reserve(size_);
for (size_t i = 0; i < size_; ++i) {
ret.emplace_back((*this)[i]);
}
return ret;
}
private:
T head_[N];
size_t size_{0};
std::deque<T> tail_;
std::vector<T> tail_;
};
} // namespace framework
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/inlined_vector.h"
#include <vector>
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
TEST(inlined_stack, inlined_stack) {
size_t max_num = 10;
InlinedVector<size_t, 5> stack;
for (size_t i = 0; i < max_num; ++i) {
ASSERT_EQ(stack.size(), i);
stack.push_back(i);
ASSERT_EQ(stack.size(), i + 1);
}
std::vector<size_t> vec = stack;
ASSERT_EQ(stack.size(), vec.size());
for (size_t i = 0; i < vec.size(); ++i) {
ASSERT_EQ(stack[i], vec[i]);
}
for (size_t i = 0; i < max_num; ++i) {
ASSERT_EQ(stack[i], i);
}
for (size_t i = 0; i < max_num; ++i) {
ASSERT_EQ(stack.back(), max_num - 1 - i);
stack.pop_back();
ASSERT_EQ(stack.size(), max_num - 1 - i);
}
}
} // namespace framework
} // namespace paddle
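
For reference, a minimal usage sketch (not part of this commit) of the InlinedVector declared above: the first N elements live in the fixed-size head_ array and any overflow spills into tail_, so short sequences avoid heap allocation.

#include <vector>
#include "paddle/fluid/framework/inlined_vector.h"

// Minimal sketch assuming the InlinedVector interface shown above.
void InlinedVectorSketch() {
  paddle::framework::InlinedVector<int, 4> v;
  for (int i = 0; i < 6; ++i) {
    v.push_back(i);  // 0..3 stay in head_, 4 and 5 spill into tail_
  }
  int last = v.back();        // 5
  std::vector<int> copy = v;  // the conversion operator copies all elements in order
  v.pop_back();               // removes 5 from the spilled storage
  (void)last;
  (void)copy;
}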
......@@ -3,13 +3,18 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator)
cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator gflags)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
cc_test(multi_bin_buffered_allocator_test SRCS multi_bin_buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator multi_bin_buffered_allocator cpu_allocator)
cc_library(auto_increment_best_fit_allocator SRCS auto_increment_best_fit_allocator.cc DEPS allocator)
cc_test(auto_increment_best_fit_allocator_test SRCS auto_increment_best_fit_allocator_test.cc DEPS cpu_allocator auto_increment_best_fit_allocator)
cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator)
cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
if (NOT WIN32)
cc_test(test_multi_bin_buffered_allocator_division_plan SRCS test_multi_bin_buffered_allocator_division_plan.cc DEPS multi_bin_buffered_allocator)
endif()
if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
......@@ -42,30 +47,20 @@ else ()
set(AllocatorFacadeDeps)
endif()
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator multi_bin_buffered_allocator auto_growth_best_fit_allocator legacy_allocator zero_size_allocator)
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS
${AllocatorFacadeDeps}
cpu_allocator
locked_allocator
best_fit_allocator
aligned_allocator
auto_increment_allocator
zero_size_allocator
conditional_allocator
retry_allocator
buffered_allocator
multi_bin_buffered_allocator
auto_increment_best_fit_allocator
allocator_strategy
legacy_allocator
)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS allocator_facade)
......@@ -14,6 +14,7 @@
#pragma once
#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
......
......@@ -27,24 +27,24 @@ bool Allocator::IsAllocThreadSafe() const { return false; }
AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
auto ptr = AllocateImpl(size, attr);
ptr->RegisterAllocatorChain(this);
ptr->RegisterDecoratedAllocator(this);
return AllocationPtr(ptr);
}
void Allocator::FreeImpl(Allocation* allocation) {
Allocator* allocator = allocation->TopAllocator();
Allocator* allocator = allocation->TopDecoratedAllocator();
allocator->Free(allocation);
}
void Allocator::Free(Allocation* allocation) {
allocation->PopAllocator();
allocation->PopDecoratedAllocator();
FreeImpl(allocation);
}
const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
void AllocationDeleter::operator()(Allocation* allocation) const {
Allocator* allocator = allocation->TopAllocator();
Allocator* allocator = allocation->TopDecoratedAllocator();
allocator->Free(allocation);
}
......
......@@ -15,8 +15,9 @@
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/inlined_stack.h"
#include "paddle/fluid/framework/inlined_vector.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
......@@ -78,29 +79,26 @@ class Allocation {
virtual ~Allocation();
// This function should only be used in unittest
std::vector<Allocator*> GetAllocatorChain() const {
std::vector<Allocator*> allocators;
for (size_t i = 0; i < allocator_chain_.size(); ++i) {
allocators.push_back(allocator_chain_[i]);
}
return allocators;
private:
std::vector<Allocator*> DecoratedAllocators() const {
return static_cast<std::vector<Allocator*>>(decorated_allocators_);
}
private:
inline void RegisterAllocatorChain(Allocator* allocator) {
allocator_chain_.push(allocator);
inline void RegisterDecoratedAllocator(Allocator* allocator) {
decorated_allocators_.push_back(allocator);
}
inline void PopAllocator() { allocator_chain_.pop(); }
inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }
inline Allocator* TopAllocator() { return allocator_chain_.top(); }
inline Allocator* TopDecoratedAllocator() {
return decorated_allocators_.back();
}
private:
void* ptr_;
size_t size_;
platform::Place place_;
framework::InlinedStack<Allocator*, 8> allocator_chain_;
framework::InlinedVector<Allocator*, 8> decorated_allocators_;
friend class Allocator;
friend class AllocationDeleter;
......
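
To make the decorated-allocator bookkeeping above concrete, here is a hedged sketch; PassThroughAllocator is hypothetical and not part of this commit. It relies on the default FreeImpl: Allocate() in the base class appends each allocator to decorated_allocators_ after the underlying allocator has appended itself, so the deleter frees through the outermost wrapper first and unwinds the chain in reverse order.

#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

// Hypothetical decorator, for illustration only. Allocating through it
// records [underlying..., this] on the Allocation; freeing pops this wrapper
// and forwards the allocation to the allocator below it.
class PassThroughAllocator : public Allocator {
 public:
  explicit PassThroughAllocator(std::shared_ptr<Allocator> underlying)
      : underlying_(std::move(underlying)) {}

 protected:
  Allocation* AllocateImpl(size_t size, Attr attr) override {
    return underlying_->Allocate(size, attr).release();
  }
  // FreeImpl is inherited: it asks the new top decorated allocator (the
  // underlying allocator) to Free() the allocation.

 private:
  std::shared_ptr<Allocator> underlying_;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle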
......@@ -17,12 +17,13 @@
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/conditional_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
......@@ -32,6 +33,7 @@
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
......@@ -51,6 +53,21 @@ namespace paddle {
namespace memory {
namespace allocation {
static inline std::shared_ptr<Allocator> WrapRetryAndBufferedAllocator(
std::shared_ptr<Allocator> allocator, int64_t retry_time,
bool enable_buffered) {
if (retry_time > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time);
allocator.reset(retry_allocator);
}
if (enable_buffered) {
allocator.reset(new MultiBinBufferedAllocator(allocator));
}
return allocator;
}
// TODO(yy): Dirty code here. This class should be configurable in runtime.
class CPUManagedAllocator : public Allocator {
public:
......@@ -117,17 +134,10 @@ class ChunkedAllocator : public Allocator {
std::shared_ptr<Allocator> allocator(new LockedAllocator(
std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));
if (retry_time_ > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time_);
allocator.reset(retry_allocator);
}
allocator = WrapRetryAndBufferedAllocator(allocator, retry_time_,
FLAGS_enable_buffered_allocator);
if (FLAGS_enable_buffered_allocator) {
allocator.reset(new MultiBinBufferedAllocator(allocator));
}
return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
return std::make_shared<AlignedAllocator<4096>>(std::move(allocator));
}
bool IsAllocThreadSafe() const override { return true; }
......@@ -210,7 +220,7 @@ class AllocatorFacadePrivate {
break;
}
case AllocatorStrategy::kAutoGrowthBestFit: {
InitCPUAllocator();
InitAutoGrowthCPUAllocator();
InitAutoGrowthCUDAAllocator();
InitAutoGrowthCUDAPinnedAllocator();
WrapZeroSizeAllocator();
......@@ -224,15 +234,25 @@ class AllocatorFacadePrivate {
}
private:
void InitAutoGrowthCPUAllocator() {
auto cpu_allocator = std::make_shared<AlignedAllocator<4096>>(
std::make_shared<CPUAllocator>());
allocators_[platform::CPUPlace()] =
std::make_shared<AutoGrowthBestFitAllocator>(
cpu_allocator, platform::CpuMaxChunkSize(), 4096);
}
void InitAutoGrowthCUDAAllocator() {
#ifdef PADDLE_WITH_CUDA
int dev_cnt = platform::GetCUDADeviceCount();
for (int dev_id = 0; dev_id < dev_cnt; ++dev_id) {
auto cuda_allocator = std::make_shared<AlignedAllocator<4096>>(
std::make_shared<CUDAAllocator>(platform::CUDAPlace(dev_id)));
allocators_[platform::CUDAPlace(dev_id)] =
std::make_shared<AutoIncrementBestFitAllocator>(
cuda_allocator, platform::GpuMaxChunkSize(), 4096);
auto allocator = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMaxChunkSize(), 4096);
allocators_[platform::CUDAPlace(dev_id)] = WrapRetryAndBufferedAllocator(
allocator, FLAGS_gpu_allocator_retry_time, false);
}
#endif
}
......@@ -242,7 +262,7 @@ class AllocatorFacadePrivate {
auto cuda_pinned_allocator = std::make_shared<AlignedAllocator<4096>>(
std::make_shared<CPUPinnedAllocator>());
allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<AutoIncrementBestFitAllocator>(
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_pinned_allocator, platform::CUDAPinnedMaxChunkSize(), 4096);
#endif
}
......@@ -300,8 +320,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size, Allocator::Attr attr) {
return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
AllocationDeleter());
return std::shared_ptr<Allocation>(Alloc(place, size, attr));
}
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
......
......@@ -19,7 +19,9 @@
DEFINE_string(
allocator_strategy, "legacy",
"The allocation strategy. Legacy means the original allocator of Fluid."
"New means the experimental allocators of Fluid. in [legacy, new]");
"naive_best_fit means the experimental best fit allocator. "
"auto_growth_best_fit means the experimental auto growth best fit "
"allocator. Enum in [legacy, naive_best_fit, auto_growth_best_fit].");
namespace paddle {
namespace memory {
......@@ -28,7 +30,7 @@ namespace allocation {
static AllocatorStrategy GetStrategyFromFlag() {
if (FLAGS_allocator_strategy == "legacy") {
return AllocatorStrategy::kLegacy;
} else if (FLAGS_allocator_strategy == "navie_best_fit") {
} else if (FLAGS_allocator_strategy == "naive_best_fit") {
return AllocatorStrategy::kNaiveBestFit;
} else if (FLAGS_allocator_strategy == "auto_growth_best_fit") {
return AllocatorStrategy::kAutoGrowthBestFit;
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include <algorithm>
#include <list>
#include <map>
......@@ -29,16 +29,14 @@ static size_t align(size_t size, size_t alignment) {
return remaining == 0 ? size : size + alignment - remaining;
}
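
For intuition, a hedged worked example of the align() helper named in the hunk header above; the computation of remaining (size % alignment) sits in the part of the function elided by the hunk.

// align() rounds a size up to the next multiple of the alignment:
//   align(1,    4096) == 4096
//   align(4096, 4096) == 4096
//   align(5000, 4096) == 8192
// AutoGrowthBestFitAllocator applies it both to the chunk size passed to the
// constructor below and to every requested size in AllocateImpl.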
AutoIncrementBestFitAllocator::AutoIncrementBestFitAllocator(
AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
size_t alignment)
: underlying_allocator_(underlying_allocator),
chunk_size_(align(chunk_size, alignment)),
alignment_(alignment) {}
Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
Attr attr) {
if (size == 0) return nullptr;
Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size, Attr attr) {
size = align(size, alignment_);
std::lock_guard<std::mutex> guard(mtx_);
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
......@@ -95,7 +93,7 @@ Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
return new Chunk::BlockAllocation(block_it);
}
void AutoIncrementBestFitAllocator::FreeImpl(Allocation *allocation) {
void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
auto &block_it = static_cast<Chunk::BlockAllocation *>(allocation)->block_it_;
auto &blocks = block_it->chunk_->blocks_;
......
......@@ -25,9 +25,9 @@ namespace paddle {
namespace memory {
namespace allocation {
class AutoIncrementBestFitAllocator : public Allocator {
class AutoGrowthBestFitAllocator : public Allocator {
public:
explicit AutoIncrementBestFitAllocator(
explicit AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
size_t alignment);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
static inline size_t AlignTo(size_t size, size_t alignment = 4096) {
auto remaining = size % alignment;
return remaining == 0 ? size : size + alignment - remaining;
}
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
FLAGS_allocator_strategy = "auto_growth_best_fit";
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), AlignTo(size));
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), AlignTo(size));
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), AlignTo(size));
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), AlignTo(size));
}
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -22,18 +22,18 @@
#include <iostream>
#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, auto_increment_best_fit_allocator) {
TEST(allocator, auto_growth_best_fit_allocator) {
auto cpu_allocator = std::make_shared<CPUAllocator>();
auto allocator =
std::make_shared<AutoIncrementBestFitAllocator>(cpu_allocator, 0, 4096);
std::make_shared<AutoGrowthBestFitAllocator>(cpu_allocator, 0, 4096);
std::mutex mtx;
std::condition_variable cv;
......@@ -60,13 +60,9 @@ TEST(allocator, auto_increment_best_fit_allocator) {
}
cv.notify_all();
thread_main();
for (auto &th : ths) {
th.join();
}
std::cout << "test ends" << std::endl;
}
} // namespace allocation
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/memory/allocation/buffered_allocator.h"
#include <gtest/gtest.h>
#include <utility>
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
......
......@@ -37,8 +37,6 @@ DEFINE_bool(init_allocated_mem, false,
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(initial_gpu_memory_in_mb);
DECLARE_double(reallocate_gpu_memory_in_mb);
DECLARE_bool(benchmark);
namespace paddle {
......@@ -72,8 +70,7 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(),
platform::CpuMaxChunkSize());
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
});
return a;
......@@ -147,28 +144,16 @@ class GPUBuddyAllocatorList {
PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
std::call_once(flags_[dev_id], [this, dev_id] {
platform::SetDeviceId(dev_id);
size_t first_size = platform::GpuFirstAllocateChunkSize();
size_t re_size = platform::GpuReAllocateChunkSize();
allocators_[dev_id] =
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), first_size, re_size);
VLOG(2) << "\n\nNOTE: each GPU device use "
<< string::HumanReadableSize(first_size) << "(initial chunk) "
<< string::HumanReadableSize(re_size) << "(reallocate chunk) "
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' or "
"'FLAGS_initial_gpu_memory_in_mb/"
"FLAGS_reallocate_gpu_memory_in_mb' to change the fraction "
"of GPU usage.\n\n";
VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use="
<< FLAGS_fraction_of_gpu_memory_to_use << ", "
<< "FLAGS_initial_gpu_memory_in_mb="
<< FLAGS_initial_gpu_memory_in_mb << ", "
<< "FLAGS_reallocate_gpu_memory_in_mb="
<< FLAGS_reallocate_gpu_memory_in_mb;
allocators_[dev_id] = new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
});
return allocators_[dev_id];
}
......@@ -251,7 +236,6 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::CUDAPinnedAllocator),
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize(),
platform::CUDAPinnedMaxChunkSize());
});
......
......@@ -14,8 +14,10 @@
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include <mutex> // NOLINT
#include <utility>
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
namespace paddle {
namespace memory {
namespace allocation {
......
......@@ -17,20 +17,37 @@
#include <cctype>
#include <fstream>
#include <limits>
#include <mutex> // NOLINT
#include <sstream>
#include <string>
#include <utility>
#include "paddle/fluid/platform/lock_guard_ptr.h"
DEFINE_double(buffered_allocator_excess_times, 2,
"Tolerant memory size times of buffered_allocator");
DEFINE_double(
buffered_allocator_excess_times, 2,
"Excess size multiple allowed when BufferedAllocator reuses cached "
"memory. BufferedAllocator tries to reuse previously freed allocations, "
"but the size of a freed allocation may not exactly match the requested "
"size, so this flag bounds how many times larger a cached allocation may "
"be than the request it serves. The best value is still to be tuned.");
DEFINE_string(division_plan_path, "", "Division plan file path");
DEFINE_string(
buffered_allocator_division_plan_path, "",
"The file path that determines the memory size division plan of "
"BufferedAllocator. If it is empty, the default division plan is used. "
"The file must be a text file in which each line gives one bound of the "
"division plan. For example, if the file has 3 lines, '500M', '1G' and "
"'2G', the division plan is [0, 500M), [500M, 1G), [1G, 2G) and "
"[2G, +inf). An allocation request whose size falls in the last interval "
"of the division plan is dispatched to underlying_allocator directly and "
"is not cached when freed.");
namespace paddle {
namespace memory {
namespace allocation {
std::string TrimStringAndToLowerCase(const std::string &str) {
static std::string TrimStringAndToUpperCase(const std::string &str) {
auto not_space = [](char ch) { return std::isspace(ch) == 0; };
auto first_idx = static_cast<size_t>(
std::find_if(str.begin(), str.end(), not_space) - str.begin());
......@@ -38,41 +55,69 @@ std::string TrimStringAndToLowerCase(const std::string &str) {
std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin());
if (first_idx == str.size() || last_idx == str.size()) return "";
last_idx = str.size() - 1 - last_idx;
last_idx = str.size() - last_idx;
auto ret = str.substr(first_idx, last_idx - first_idx);
std::for_each(ret.begin(), ret.end(),
[](char &ch) { ch = std::tolower(ch); });
[](char &ch) { ch = std::toupper(ch); });
return ret;
}
static size_t ParseStringToBytes(const std::string &str) {
std::string ret = str;
if (ret.back() == 'b') {
ret.pop_back();
namespace {
enum DivisionPlanFileStatus { kEOF, kException, kNormal };
} // NOLINT
static size_t ParseStringToBytes(const std::string &original_str,
DivisionPlanFileStatus *ret_code) {
std::string str = TrimStringAndToUpperCase(original_str);
if (str.empty()) {
*ret_code = kEOF;
return 0;
}
if (str.back() == 'B') {
str.pop_back();
if (str.empty()) {
*ret_code = kException;
return 0;
}
}
PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str);
size_t multiples = 1;
switch (ret.back()) {
case 'g':
switch (str.back()) {
case 'G':
multiples *= (static_cast<size_t>(1) << 30);
break;
case 'm':
case 'M':
multiples *= (static_cast<size_t>(1) << 20);
break;
case 'k':
case 'K':
multiples *= (static_cast<size_t>(1) << 10);
break;
default:
break;
}
if (multiples != 1) ret.pop_back();
ret = TrimStringAndToLowerCase(ret);
double ret_val = 0.0;
std::stringstream ss(ret);
PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str);
return static_cast<size_t>(ret_val * multiples);
if (multiples != 1) {
str.pop_back();
if (str.empty()) {
*ret_code = kException;
return 0;
}
}
str = TrimStringAndToUpperCase(str);
double mem_val = -1.0;
std::stringstream ss(str);
if (!(ss >> mem_val) || mem_val < 0) {
*ret_code = kException;
return 0;
}
*ret_code = kNormal;
return static_cast<size_t>(mem_val * multiples);
}
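
A few hedged worked examples of ParseStringToBytes() as defined above, consistent with the division-plan unit test added later in this commit (s is a DivisionPlanFileStatus):

// ParseStringToBytes("100b",   &s) -> 100,                                   s == kNormal
// ParseStringToBytes("300.7K", &s) -> static_cast<size_t>(300.7 * 1024),     s == kNormal
// ParseStringToBytes("1.02gB", &s) -> static_cast<size_t>(1.02 * (1 << 30)), s == kNormal
// ParseStringToBytes("",       &s) -> 0, s == kEOF
// ParseStringToBytes("abc",    &s) -> 0, s == kException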
static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
......@@ -84,16 +129,27 @@ static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
return ret + "]";
}
static std::vector<size_t> ReadDivisionPlanFromFile(
std::vector<size_t> ReadBufferedAllocatorDivisionPlanFromFile(
const std::string &filepath) {
std::ifstream is(filepath.c_str());
PADDLE_ENFORCE(is.good(), "File not exist");
PADDLE_ENFORCE(is.good(), "File %s does not exist", filepath);
std::string str;
std::vector<size_t> plan;
size_t line_num = 1;
while (std::getline(is, str).good()) {
str = TrimStringAndToLowerCase(str);
if (str.empty()) break;
plan.push_back(ParseStringToBytes(str));
DivisionPlanFileStatus status;
size_t ret = ParseStringToBytes(str, &status);
if (status == kEOF) {
break;
}
if (status == kException) {
PADDLE_THROW(
"Invalid format in line %d of file %s: '%s'. Only support B, KB, MB, "
"GB.",
line_num, filepath, str);
}
plan.push_back(ret);
++line_num;
}
return plan;
}
......@@ -110,11 +166,12 @@ static void CheckAndModifyMemoryDivisionPlan(
}
PADDLE_ENFORCE(is_strictly_sorted, "Division plan must be strictly sorted");
// Insert 0 and remove MAX from division plan for clean binary searching code
// Insert 0 into division plan for clean binary searching code
if (division_plan->empty() || division_plan->front() != 0) {
division_plan->insert(division_plan->begin(), 0);
}
// Remove MAX from division plan for clean binary searching code
constexpr auto kSizeTypeMax = std::numeric_limits<size_t>::max();
if (division_plan->back() == kSizeTypeMax) {
division_plan->pop_back();
......@@ -124,21 +181,17 @@ static void CheckAndModifyMemoryDivisionPlan(
}
static std::vector<size_t> GetDefaultDivisionPlan() {
if (!FLAGS_division_plan_path.empty()) {
return ReadDivisionPlanFromFile(FLAGS_division_plan_path);
if (!FLAGS_buffered_allocator_division_plan_path.empty()) {
return ReadBufferedAllocatorDivisionPlanFromFile(
FLAGS_buffered_allocator_division_plan_path);
}
// Default division plan is 4K, 8K, 16K, ..., 512M, 1G
constexpr size_t kMaxLogSize = 30;
std::vector<size_t> plan;
for (size_t i = 12; i <= kMaxLogSize; ++i) {
plan.push_back(static_cast<size_t>(1) << i);
}
/*
for (size_t i = 0; i < sizeof(size_t) * 8; ++i) {
plan.push_back(static_cast<size_t>(1) << i);
}
*/
return plan;
}
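
For intuition about how the plan is consumed, a hedged sketch follows; FindBin is a hypothetical stand-in, and the real lookup is done by FindDivisionPlanBinIndex, which is not part of this hunk. After CheckAndModifyMemoryDivisionPlan has prepended 0, a request of size s belongs to bin i when plan[i] <= s < plan[i + 1], and sizes falling in the last interval are never cached by FreeImpl.

#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical stand-in, for illustration only. With plan = {0, 4K, 8K, ..., 1G}:
//   FindBin(plan, 5000)           -> the bin covering [4K, 8K)
//   FindBin(plan, size_t{2} << 30) -> plan.size() - 1, i.e. the last interval
//                                     [1G, +inf), which FreeImpl never caches.
static size_t FindBin(const std::vector<size_t>& plan, size_t size) {
  auto it = std::upper_bound(plan.begin(), plan.end(), size);
  return static_cast<size_t>(it - plan.begin()) - 1;
}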
......@@ -164,6 +217,7 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator(
division_plan_(division_plan) {
CheckAndModifyMemoryDivisionPlan(&division_plan_);
allocations_.resize(division_plan_.size() - 1);
accumulated_cache_size_.assign(division_plan_.size() - 1, 0UL);
mtx_.resize(division_plan_.size() - 1);
if (underlying_allocator_->IsAllocThreadSafe()) {
for (auto &mtx : mtx_) {
......@@ -182,28 +236,22 @@ void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
platform::LockGuardPtr<std::mutex> guard(mtx_[bin_index]);
allocations_[bin_index].emplace(allocation->size(),
AllocationPtr(allocation));
accumulated_cache_size_[bin_index] += allocation->size();
} else {
underlying_allocator_->Free(allocation);
}
}
// bin_index is not used currently.
// Maybe we can design more flexible FreeCache strategy based on bin_index
// and require size.
size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
size_t MultiBinBufferedAllocator::ClearCache() {
size_t accumulated_size = 0;
// FIXME(zjl): free the largest first when there is no extra
for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
platform::LockGuardPtr<std::mutex> lock(mtx_[i]);
if (allocations_[i].empty()) continue;
auto it = --allocations_[i].end();
do {
accumulated_size += it->second->size();
underlying_allocator_->Free(it->second.release());
allocations_[i].erase(it--);
if (accumulated_size >= size) {
return accumulated_size;
}
} while (!allocations_[i].empty());
allocations_[i].clear();
accumulated_size += accumulated_cache_size_[i];
accumulated_cache_size_[i] = 0;
}
return accumulated_size;
}
......@@ -212,10 +260,6 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
auto upper_size = TolerantUpperSize(size);
// if (bin_index >= allocations_.size()) {
// VLOG(2) << "Allocate " << size << " from underlying directly";
//}
for (; bin_index < allocations_.size() &&
upper_size >= division_plan_[bin_index];
++bin_index) {
......@@ -226,6 +270,7 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
size_t sz = it->second->size();
auto ret = std::move(it->second);
allocation.erase(it);
accumulated_cache_size_[bin_index] -= sz;
VLOG(3) << "Allocate " << sz << "(required " << size
<< ") from cache directly";
return ret.release();
......@@ -239,10 +284,7 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
VLOG(2) << "Allocate " << size << " from underlying directly";
return ret;
} catch (BadAlloc &) {
VLOG(1) << retry_time << "-th BadAlloc raises, try to free " << size
<< " bytes caches";
// size_t actual_free_size = FreeCache(size, bin_index);
size_t actual_free_size = FreeCache(-1UL, bin_index);
size_t actual_free_size = ClearCache();
VLOG(1) << retry_time << "-th free " << actual_free_size
<< " bytes caches";
if (actual_free_size == 0) throw;
......@@ -251,6 +293,8 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
}
}
void UseMultiBinBufferedAllocatorGFlags() {}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -16,6 +16,8 @@
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
......@@ -24,6 +26,9 @@ namespace paddle {
namespace memory {
namespace allocation {
std::vector<size_t> ReadBufferedAllocatorDivisionPlanFromFile(
const std::string& filepath);
class MultiBinBufferedAllocator : public Allocator {
public:
explicit MultiBinBufferedAllocator(
......@@ -34,21 +39,24 @@ class MultiBinBufferedAllocator : public Allocator {
bool IsAllocThreadSafe() const override { return mtx_.front() != nullptr; }
void ClearCache() { FreeCache(static_cast<size_t>(-1), 0); }
size_t ClearCache();
const std::vector<size_t>& DivisionPlan() const { return division_plan_; }
protected:
Allocation* AllocateImpl(size_t size, Attr attr) override;
void FreeImpl(Allocation* allocation) override;
private:
size_t FreeCache(size_t size, size_t bin_index);
std::shared_ptr<Allocator> underlying_allocator_;
std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
std::vector<size_t> accumulated_cache_size_;
std::vector<size_t> division_plan_;
std::vector<std::unique_ptr<std::mutex>> mtx_;
};
extern void UseMultiBinBufferedAllocatorGFlags();
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -14,6 +14,7 @@
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
#include <gtest/gtest.h>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
......@@ -123,10 +124,31 @@ TEST(buffered_allocator, lazy_free) {
{
underlying_allocator->ResetCounter();
allocator->ClearCache();
size_t cache_size = allocator->ClearCache();
ASSERT_EQ(cache_size, static_cast<size_t>(alloc_size + 2048));
ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
ASSERT_EQ(underlying_allocator->GetFreeCount(), kTwo);
}
{
underlying_allocator->ResetCounter();
auto p = allocator->Allocate(allocator->DivisionPlan().back(),
allocator->kDefault);
ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
}
ASSERT_EQ(underlying_allocator->GetFreeCount(), kOne);
{
underlying_allocator->ResetCounter();
auto p = allocator->Allocate(allocator->DivisionPlan().back() - 1,
allocator->kDefault);
ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
}
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
}
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_bool(enable_buffered_allocator);
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
FLAGS_allocator_strategy = "naive_best_fit";
FLAGS_enable_buffered_allocator = true;
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), size);
}
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -18,6 +18,7 @@
#include <condition_variable> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
DECLARE_string(buffered_allocator_division_plan_path);
namespace paddle {
namespace memory {
namespace allocation {
TEST(buffered_allocator, division_plan) {
std::string path = "/tmp/buffered_allocator_divison_plan";
FLAGS_buffered_allocator_division_plan_path = path;
{
std::vector<std::string> plan(
{"100b", "300.7K", "500.3m", "1.02gB", "2g", "4G"});
std::ofstream os(path);
for (auto &p : plan) {
os << p << std::endl;
}
os.close();
}
auto plan = ReadBufferedAllocatorDivisionPlanFromFile(
FLAGS_buffered_allocator_division_plan_path);
ASSERT_EQ(plan.size(), 6UL);
ASSERT_EQ(plan[0], 100UL);
ASSERT_EQ(plan[1], static_cast<size_t>(300.7 * 1024));
ASSERT_EQ(plan[2], static_cast<size_t>(500.3 * 1024 * 1024));
ASSERT_EQ(plan[3], static_cast<size_t>(1.02 * 1024 * 1024 * 1024));
ASSERT_EQ(plan[4], static_cast<size_t>(2.0 * 1024 * 1024 * 1024));
ASSERT_EQ(plan[5], static_cast<size_t>(4.0 * 1024 * 1024 * 1024));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -22,21 +22,22 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
if (dynamic_cast<ZeroSizeAllocation *>(allocation)) {
delete allocation;
Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (size == 0) {
return new Allocation(nullptr, 0, place_);
} else {
underlying_allocator_->Free(allocation);
return underlying_allocator_->Allocate(size, attr).release();
}
}
Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (size == 0) {
return new ZeroSizeAllocation(place_);
void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
if (allocation->size() == 0) {
delete allocation;
} else {
return underlying_allocator_->Allocate(size, attr).release();
underlying_allocator_->Free(allocation);
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
......@@ -23,12 +24,6 @@ namespace allocation {
// The allocator handles requests whose size is zero. It always returns an
// allocation even when the requested size is zero; however, the
// allocation.ptr() is nullptr.
class ZeroSizeAllocation : public Allocation {
public:
explicit ZeroSizeAllocation(const platform::Place& p)
: Allocation(nullptr, 0, p) {}
};
class ZeroSizeAllocator : public Allocator {
public:
ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
......
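
A hedged usage sketch (not part of this commit) of the zero-size path described above, going through AllocatorFacade, whose strategies wrap each place's allocator with ZeroSizeAllocator via WrapZeroSizeAllocator():

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"

// Requesting zero bytes still yields a live Allocation object; its ptr() is
// nullptr and its size() is 0, so FreeImpl() above deletes the object
// instead of handing it back to the underlying allocator.
void ZeroSizeRequestSketch() {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  auto zero = facade.Alloc(paddle::platform::CPUPlace(), 0);
  // zero != nullptr, zero->ptr() == nullptr, zero->size() == 0
}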
......@@ -25,11 +25,9 @@ namespace detail {
BuddyAllocator::BuddyAllocator(
std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
size_t first_allocate_chunk_size, size_t reallocate_chunk_size)
size_t max_chunk_size)
: min_chunk_size_(min_chunk_size),
first_allocate_chunk_size_(first_allocate_chunk_size),
reallocate_chunk_size_(reallocate_chunk_size),
max_chunk_size_(first_allocate_chunk_size),
max_chunk_size_(max_chunk_size),
cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {}
......@@ -38,10 +36,9 @@ BuddyAllocator::~BuddyAllocator() {
"have actually been freed";
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
auto desc = cache_.load(block);
VLOG(10) << "Free from block (" << block << ", " << desc.size << ")";
VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
system_allocator_->Free(block, desc.size, desc.index);
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
pool_.erase(pool_.begin());
}
......@@ -66,7 +63,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// if the allocation is huge, send directly to the system allocator
if (size > max_chunk_size_) {
VLOG(10) << "Allocate from system allocator.";
return SystemAlloc(size, false);
return SystemAlloc(size);
}
// query and allocate from the existing chunk
......@@ -75,9 +72,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// refill the pool if failure
if (it == pool_.end()) {
it = RefillPool();
// if still failure, try to allocate from SystemAllocator
// if still failure, fail fatally
if (it == pool_.end()) {
return SystemAlloc(size, false);
return nullptr;
}
} else {
VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
......@@ -101,7 +98,7 @@ void BuddyAllocator::Free(void* p) {
VLOG(10) << "Free from address " << block;
if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) {
if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
VLOG(10) << "Free directly from system allocator";
system_allocator_->Free(block, block->total_size(cache_),
block->index(cache_));
......@@ -171,12 +168,9 @@ void BuddyAllocator::Free(void* p) {
size_t BuddyAllocator::Used() { return total_used_; }
size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
size_t BuddyAllocator::GetMaxChunkSize() {
std::lock_guard<std::mutex> lock(mutex_);
return max_chunk_size_;
}
size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
void* BuddyAllocator::SystemAlloc(size_t size) {
size_t index = 0;
void* p = system_allocator_->Alloc(&index, size);
......@@ -184,23 +178,25 @@ void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
if (p == nullptr) return nullptr;
static_cast<MemoryBlock*>(p)->init(
&cache_, is_managed ? MemoryBlock::MANAGED_HUGE_CHUNK
: MemoryBlock::UNMANAGED_HUGE_CHUNK,
index, size, nullptr, nullptr);
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
size, nullptr, nullptr);
return static_cast<MemoryBlock*>(p)->data();
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
if (total_used_ + total_free_ > 0) {
max_chunk_size_ = reallocate_chunk_size_;
#ifdef PADDLE_WITH_CUDA
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation.
max_chunk_size_ = platform::GpuMaxChunkSize();
}
}
#endif
// Allocate a new maximum sized block
size_t index = 0;
size_t chunk_size = max_chunk_size_;
void* p = system_allocator_->Alloc(&index, chunk_size);
void* p = system_allocator_->Alloc(&index, max_chunk_size_);
if (p == nullptr) return pool_.end();
......@@ -208,7 +204,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
<< " from system allocator";
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
chunk_size, nullptr, nullptr);
max_chunk_size_, nullptr, nullptr);
// gpu fallback allocation
if (system_allocator_->UseGpu() &&
......@@ -216,10 +212,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
fallback_alloc_count_++;
}
total_free_ += chunk_size;
total_free_ += max_chunk_size_;
// dump the block into pool
return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first;
return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
......@@ -275,24 +271,27 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
void BuddyAllocator::CleanIdleFallBackAlloc() {
// If fallback allocation does not exist, return directly
if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return;
if (!fallback_alloc_count_) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
auto desc = cache_.load(block);
if (desc.index == 0) {
// If no GPU fallback allocator, return
if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
return;
}
VLOG(10) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, desc.size, block->index(cache_));
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= desc.size;
total_free_ -= max_chunk_size_;
fallback_alloc_count_--;
// If no fall allocation exists, return directly
......@@ -316,21 +315,19 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
if (!shall_free_alloc()) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
auto desc = cache_.load(block);
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) {
return;
}
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
VLOG(10) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, desc.size, desc.index);
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= desc.size;
total_free_ -= max_chunk_size_;
if (!shall_free_alloc()) return;
}
......
......@@ -34,8 +34,7 @@ namespace detail {
class BuddyAllocator {
public:
BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
size_t min_chunk_size, size_t first_allocate_chunk_size,
size_t reallocate_chunk_size);
size_t min_chunk_size, size_t max_chunk_size);
~BuddyAllocator();
......@@ -58,7 +57,7 @@ class BuddyAllocator {
using PoolSet = std::set<IndexSizeAddress>;
/*! \brief Allocate fixed-size memory from system */
void* SystemAlloc(size_t size, bool is_managed = true);
void* SystemAlloc(size_t size);
/*! \brief If existing chunks are not suitable, refill pool */
PoolSet::iterator RefillPool();
......@@ -88,11 +87,7 @@ class BuddyAllocator {
size_t total_free_ = 0; // the total size of free memory
size_t min_chunk_size_; // the minimum size of each chunk
size_t first_allocate_chunk_size_;
size_t reallocate_chunk_size_;
size_t max_chunk_size_;
size_t max_chunk_size_; // the maximum size of each chunk
private:
/**
......
......@@ -27,11 +27,10 @@ class MetadataCache;
// MemoryBlock::Desc and the payload.
struct MemoryBlock {
enum Type {
FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied
MANAGED_HUGE_CHUNK, // memory is huge and out of management
UNMANAGED_HUGE_CHUNK, // memory is huge and managed by allocator
INVALID_CHUNK // memory is invalid
FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied
HUGE_CHUNK, // memory is out of management
INVALID_CHUNK // memory is invalid
};
// init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
......
......@@ -38,22 +38,6 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk.");
DEFINE_double(
initial_gpu_memory_in_mb, -1.0,
"GPU memory chunk size in MB."
"Allocator would allocate FLAGS_initial_gpu_memory_in_mb size "
"chunk first and reallocate FLAGS_reallocate_gpu_memory_in_mb size "
"chunk when the first chunk is not enough. This flag has higher priority "
"than FLAGS_fraction_of_gpu_memory_to_use. Disable when less than 0.");
DEFINE_double(reallocate_gpu_memory_in_mb, -1.0,
"GPU memory chunk size in MB."
"If FLAGS_initial_gpu_memory_in_mb is set and "
"FLAGS_reallocate_gpu_memory_in_mb "
"is less than 0, it would be replaced by "
"FLAGS_initial_gpu_memory_in_mb. Disable "
"when FLAGS_initial_gpu_memory_in_mb is less than 0.");
DEFINE_bool(
enable_cublas_tensor_op_math, false,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
......@@ -227,54 +211,13 @@ size_t GpuMaxChunkSize() {
size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
(total - reserving));
PADDLE_ENFORCE_LE(allocating, available,
"Insufficient GPU memory to allocation.");
return allocating;
}
size_t GpuFirstAllocateChunkSize() {
if (FLAGS_initial_gpu_memory_in_mb <= 0) {
return GpuMaxChunkSize();
}
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M";
size_t initial_mem =
static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb * (1 << 20));
PADDLE_ENFORCE_LE(initial_mem, available,
"Insufficient GPU memory to allocation.");
return initial_mem;
}
size_t GpuReAllocateChunkSize() {
if (FLAGS_initial_gpu_memory_in_mb <= 0) {
return GpuMaxChunkSize();
}
double reallocate_mem = FLAGS_reallocate_gpu_memory_in_mb;
if (reallocate_mem < 0) {
PADDLE_ENFORCE(FLAGS_initial_gpu_memory_in_mb > 0,
"FLAGS_init_gpu_memory_to_use_mb must be larger than 0");
reallocate_mem = FLAGS_initial_gpu_memory_in_mb;
}
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M";
size_t realloc_mem = static_cast<size_t>(reallocate_mem * (1 << 20));
PADDLE_ENFORCE_LE(realloc_mem, available,
"Insufficient GPU memory to allocation.");
return realloc_mem;
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
......
......@@ -66,12 +66,6 @@ size_t GpuMinChunkSize();
//! Get the maximum chunk size for GPU buddy allocator.
size_t GpuMaxChunkSize();
//! Get init chunk size for GPU buddy allocator.
size_t GpuFirstAllocateChunkSize();
//! Get reallocate chunk size for GPU buddy allocator.
size_t GpuReAllocateChunkSize();
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream);
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/platform/temporary_allocator.h"
#include <memory>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
DEFINE_int64(limit_of_tmp_allocation, -1,
......
......@@ -16,6 +16,7 @@
#include <condition_variable> // NOLINT
#include <deque>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
......
......@@ -39,6 +39,7 @@ limitations under the License. */
#include "paddle/fluid/imperative/profiler.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
......@@ -133,6 +134,9 @@ PYBIND11_MODULE(core, m) {
paddle::platform::CpuTotalPhysicalMemory();
paddle::memory::allocation::UseAllocatorStrategyGFlag();
paddle::memory::allocation::UseMultiBinBufferedAllocatorGFlags();
m.doc() = "C++ core of PaddlePaddle";
// using framework in this function. Since it is inside a function, it will
......
......@@ -105,14 +105,12 @@ void Printf(const char* fmt, const Args&... args) {
Fprintf(std::cout, fmt, args...);
}
template <typename T>
std::string HumanReadableSize(T size) {
inline std::string HumanReadableSize(double f_size) {
size_t i = 0;
double f_size = static_cast<double>(size);
double orig = f_size;
const std::vector<std::string> units(
{"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
while (f_size > 1024) {
while (f_size >= 1024) {
f_size /= 1024;
i++;
}
......
......@@ -130,7 +130,8 @@ def __bootstrap__():
'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
'allocator_strategy', 'enable_buffered_allocator',
'buffered_allocator_excess_times', 'reader_queue_speed_test_mode',
'buffered_allocator_excess_times',
'buffered_allocator_division_plan_path', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize',
......@@ -163,7 +164,6 @@ def __bootstrap__():
if core.is_compiled_with_cuda():
read_env_flags += [
'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb',
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
......