diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index ebe680f5eea4948339fb8c5584a5b9f5d71c752e..2462ba084b996b33cf95925ea6d2c476b53f4182 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -12,22 +12,161 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#pragma once
-
 #include "paddle/memory/detail/buddy_allocator.h"
+#include "glog/logging.h"
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
-                               SystemAllocator* system_allocator)
-    : pool_size_(pool_size),
-      max_pools_(max_pools),
-      system_allocator_(system_allocator) {
-  PADDLE_ASSERT(pool_size > 0);
-  PADDLE_ASSERT(max_pools > 0);
+// Stores the (non-null) system allocator and the min/max chunk sizes (> 0).
+BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
+                               size_t min_chunk_size, size_t max_chunk_size)
+    : min_chunk_size_(min_chunk_size),
+      max_chunk_size_(max_chunk_size),
+      system_allocator_(system_allocator) {
+  PADDLE_ASSERT(min_chunk_size > 0);
+  PADDLE_ASSERT(max_chunk_size > 0);
   PADDLE_ASSERT(system_allocator != nullptr);
+}
+
+// Rounds `size` up to the nearest multiple of `alignment` (must be non-zero).
+inline size_t align(size_t size, size_t alignment) {
+  return size % alignment == 0 ? size : size + (alignment - size % alignment);
+}
+
+// Allocates memory for a request of `unaligned_size` bytes. The request is
+// padded by sizeof(Metadata) and rounded up to a multiple of min_chunk_size_;
+// padded requests above max_chunk_size_ are served by SystemAlloc(). Returns
+// the block's data() pointer, or nullptr if no memory could be obtained.
+void* BuddyAllocator::Alloc(size_t unaligned_size) {
+  // Pad by the metadata size, then round up to the chunk granularity.
+  const size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);
+
+  // Hold the allocator lock for the rest of the call.
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size "
+             << size;
+
+  // Requests larger than one chunk bypass the pool entirely.
+  if (size > max_chunk_size_) {
+    DLOG(INFO) << "Allocate from system allocator.";
+
+    return SystemAlloc(size);
+  }
+
+  // Reuse a pooled block when possible; otherwise grow the pool.
+  auto it = FindExistChunk(size);
+
+  if (it == pool_.end()) {
+    it = RefillPool();
+    // Refill failed as well: report the failure to the caller.
+    if (it == pool_.end()) return nullptr;
+  } else {
+    DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it)
+               << " at address "
+               << static_cast<MemoryBlock*>(std::get<2>(*it))->data();
+  }
+
+  total_used_ += size;
+  total_free_ -= size;
+
+  // Take `size` bytes from the chosen block and return its data pointer.
+  return static_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
+}
+
+// Gets `size` bytes straight from the system allocator, tags the memory as a
+// HUGE_CHUNK block and returns its data() pointer (nullptr on failure).
+void* BuddyAllocator::SystemAlloc(size_t size) {
+  size_t allocator_index = 0;
+  void* raw = system_allocator_->Alloc(allocator_index, size);
+  DLOG(INFO) << "Allocated " << raw << " from system allocator.";
+  if (raw == nullptr) return nullptr;
+
+  auto* block = static_cast<MemoryBlock*>(raw);
+  block->init(cache_, MemoryBlock::HUGE_CHUNK, allocator_index, size, nullptr,
+              nullptr);
+  return block->data();
+}
+
+// Requests one more max_chunk_size_-sized block from the system allocator and
+// adds it to the pool as a FREE_CHUNK. Returns the iterator of the inserted
+// entry, or pool_.end() if the system allocator returned nullptr.
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+#ifndef PADDLE_ONLY_CPU
+  // A GPU allocator that holds no memory yet re-reads its maximum chunk size.
+  if (system_allocator_->UseGpu() && (total_used_ + total_free_) == 0) {
+    max_chunk_size_ = platform::GpuMaxChunkSize();
+  }
+#endif  // PADDLE_ONLY_CPU
+
+  size_t allocator_index = 0;
+  void* raw = system_allocator_->Alloc(allocator_index, max_chunk_size_);
+  if (raw == nullptr) return pool_.end();
+
+  DLOG(INFO) << " Creating and inserting new block " << raw
+             << " from system allocator";
+
+  auto* block = static_cast<MemoryBlock*>(raw);
+  block->init(cache_, MemoryBlock::FREE_CHUNK, allocator_index,
+              max_chunk_size_, nullptr, nullptr);
+  total_free_ += max_chunk_size_;
+
+  // Register the fresh block so later lookups can find it.
+  const auto inserted = pool_.insert({allocator_index, max_chunk_size_, raw});
+  return inserted.first;
+}
+
+// Searches the pool for a free block of at least `size` bytes, scanning the
+// allocator indices in increasing order. Returns pool_.end() if none exists.
+BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
+  size_t index = 0;
+
+  for (;;) {
+    // First entry that is not ordered before (index, size, nullptr).
+    const auto candidate = pool_.lower_bound({index, size, nullptr});
+    if (candidate == pool_.end()) return candidate;
+
+    const size_t found_index = std::get<0>(*candidate);
+    const bool same_index = found_index <= index;
+    if (same_index || std::get<1>(*candidate) >= size) return candidate;
+
+    // Landed on a higher index whose block is too small: retry from there.
+    index = found_index;
+  }
+}
+
+// Takes the block referenced by `it` out of the pool, calls split() on it for
+// `size` bytes and marks it ARENA_CHUNK. If the block then has a right buddy
+// of type FREE_CHUNK, that buddy is inserted into the pool. Returns the block.
+void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
+                                   size_t size) {
+  auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
+  // `it` is invalidated by erase(); only `block` is used from here on.
+  pool_.erase(it);
+
+  DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_)
+             << ") into";
+
+  block->split(cache_, size);
+
+  DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_)
+             << ")";
+
+  block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
+
+  if (!block->has_right_buddy(cache_)) return block;
+
+  // Return the remainder of the split, if it is free, to the pool.
+  auto rest = block->right_buddy(cache_);
+  if (rest->type(cache_) == MemoryBlock::FREE_CHUNK) {
+    DLOG(INFO) << " Insert right block (" << rest << ", "
+               << rest->total_size(cache_) << ")";
+    pool_.insert({rest->index(cache_), rest->total_size(cache_), rest});
+  }
+
+  return block;
 }
 
 }  // namespace detail
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 82e6aaedc719966b4074449ce1ef7193c73dc265..38bedc9a18366c328e17da744dbb16b0940bbeef 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -15,9 +15,15 @@
 #pragma once
 
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/memory/detail/metadata.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
 
+#include <set>
 #include <mutex>
 #include <vector>
+#include <unordered_map>
 
 namespace paddle {
 namespace memory {
@@ -25,55 +31,83 @@ namespace detail {
 
 class BuddyAllocator {
  public:
-  BuddyAllocator(size_t pool_size, size_t max_pools,
-                 SystemAllocator* system_allocator);
+  BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size,
+                 size_t max_chunk_size);
+
   ~BuddyAllocator();
 
-  void* Alloc(size_t size);
+ public:
+  void* Alloc(size_t unaligned_size);
   void Free(void*);
   size_t Used();
 
+ public:
+  // Disable copy and assignment.
+  BuddyAllocator(const BuddyAllocator&) = delete;
+  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+
  private:
-  struct Block {
-    size_t size_;
-    Block* left_;   // left buddy
-    Block* right_;  // right buddy
-  };
+  // Pool entry: (allocator index, memory size, memory address)
+  using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
+  using PoolSet = std::set<IndexSizeAddress>;
 
-  // Initially, there is only one pool.  If a Alloc founds not enough
-  // memory from that pool, and there has not been max_num_pools_,
-  // create a new pool by calling system_allocator_.Alloc(pool_size_).
-  std::vector<void*> pools_;
+  /*! \brief Allocate `size` bytes directly from the system allocator */
+  void* SystemAlloc(size_t size);
 
-  size_t pool_size_;      // the size of each pool;
-  size_t max_num_pools_;  // the size of all pools;
+  /*! \brief Add a new chunk to the pool when no pooled chunk is suitable */
+  PoolSet::iterator RefillPool();
 
-  SystemAllocator* system_allocator_;
+  /**
+   *  \brief Remove a chunk from the pool and split it for an allocation
+   *
+   *  \param it   pool iterator which refers to a suitable block.
+   *  \param size the size of the allocation.
+   */
+  void* SplitToAlloc(PoolSet::iterator it, size_t size);
 
-  std::mutex mutex_;
+  /*! \brief Find a pooled chunk that can hold an allocation of `size` */
+  PoolSet::iterator FindExistChunk(size_t size);
 
-  // Disable copy and assignment.
-  BuddyAllocator(const BuddyAllocator&) = delete;
-  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+ private:
+  size_t total_used_ = 0;  // the total size of used memory
+  size_t total_free_ = 0;  // the total size of free memory
+
+  size_t min_chunk_size_;  // allocations are rounded up to a multiple of it
+  size_t max_chunk_size_;  // refill size; larger requests bypass the pool
+
+ private:
+  PoolSet pool_;
+
+ private:
+  // Unify the metadata format between GPU and CPU allocations
+  using MetadataCache = std::unordered_map<const MemoryBlock*, Metadata>;
+  MetadataCache cache_;
+
+ private:
+  SystemAllocator* system_allocator_;
+  std::mutex mutex_;  // locked by Alloc()
 };
 
-BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
-  static BuddyAllocator<CPUAllocator>* a = nullptr;
+inline BuddyAllocator* GetCPUBuddyAllocator() {  // inline: defined in a header
+  static BuddyAllocator* a = nullptr;  // shared instance, built on first call
   if (a == nullptr) {
-    a = new BuddyAllocator<CPUAllocator>();
+    a = new BuddyAllocator(new CPUAllocator, platform::CpuMinChunkSize(),
+                           platform::CpuMaxChunkSize());
   }
   return a;
 }
 
 #ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
 
-BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
-  static BuddyAllocator<GPUAllocator>** as = NULL;
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
   if (as == NULL) {
-    int gpu_num = platform::GetDeviceCount();
-    as = new BuddyAllocator<GPUAllocator>*[gpu_num];
+    int gpu_num = platform::GpuDeviceCount();
+    as = new BuddyAllocator*[gpu_num];
     for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] = new BuddyAllocator<GPUAllocator>();
+      as[gpu] =
+          new BuddyAllocator(new GPUAllocator, platform::GpuMinChunkSize(),
+                             platform::GpuMaxChunkSize());
     }
   }
   return as[gpu_id];