diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index 1d9e7177f9547af8a083b2304163497b987fac46..706216c8bfdcc85a891572c4290b5ba8ac76f360 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -41,8 +41,7 @@ BestFitAllocator::BestFitAllocator(Allocation* allocation)
   chunk.offset_ = 0;
   chunk.is_free = true;
   chunks_.emplace_back(chunk);
-  free_chunks_[HighestBitPos(chunk.size_)].insert(
-      {chunk.size_, chunks_.begin()});
+  InsertFreeNode(chunks_.begin());
 }
 
 std::unique_ptr<Allocation> BestFitAllocator::Allocate(size_t size, Attr attr) {
@@ -86,35 +85,33 @@ BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size,
   details::Chunk remaining;
   to_use.size_ = request_size;
   to_use.is_free = false;
-  remaining.size_ = remaining_size;
-  remaining.is_free = true;
-
   // calc offsets
   to_use.offset_ = to_split_it->offset_;
-  remaining.offset_ = to_use.offset_ + to_use.size_;
 
   // insert to chunk list
   auto to_use_it = chunks_.insert(to_split_it, to_use);
-  if (remaining.size_ != 0) {
-    auto bit_size = static_cast<size_t>(HighestBitPos(remaining.size_));
-    free_chunks_[bit_size].insert(
-        {remaining.size_, chunks_.insert(to_split_it, remaining)});
+  if (remaining_size != 0) {
+    remaining.size_ = remaining_size;
+    remaining.is_free = true;
+    remaining.offset_ = to_use.offset_ + to_use.size_;
+    auto remaining_it = chunks_.insert(to_split_it, remaining);
+    InsertFreeNode(remaining_it);
   }
   chunks_.erase(to_split_it);
   return to_use_it;
 }
 
 void BestFitAllocator::Free(Allocation* allocation) {
-  auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
+  auto* bf_allocation = reinterpret_cast<BestFitAllocation*>(allocation);
   auto chunk_it = bf_allocation->ChunkIterator();
   PADDLE_ENFORCE(!chunk_it->is_free);
   chunk_it->is_free = true;
-  if (chunk_it != chunks_.begin()) {
+  if (chunk_it != chunks_.begin()) {  // not the first chunk, try to merge prev.
     auto prev_it = chunk_it;
     --prev_it;
 
     if (prev_it->is_free) {
-      // Merge Left.
+      // Merge Prev.
       EraseFreeNode(prev_it);
       prev_it->size_ += chunk_it->size_;
       chunks_.erase(chunk_it);
@@ -125,6 +122,7 @@ void BestFitAllocator::Free(Allocation* allocation) {
   auto next_it = chunk_it;
   ++next_it;
   if (next_it != chunks_.end() && next_it->is_free) {
+    // not the last chunk, try to merge next.
     EraseFreeNode(next_it);
     chunk_it->size_ += next_it->size_;
     chunks_.erase(next_it);
@@ -139,9 +137,11 @@ void BestFitAllocator::InsertFreeNode(const ListIt& it) {
   free_map.insert({it->size_, it});
 }
 void BestFitAllocator::EraseFreeNode(const ListIt& it) {
-  size_t pos = static_cast<size_t>(HighestBitPos(it->size_));
+  auto pos = static_cast<size_t>(HighestBitPos(it->size_));
   auto& free_map = free_chunks_[pos];
   auto map_it = free_map.find(it->size_);
+
+  // free_chunks_[pos] is a multimap; linearly find the entry that refers to it.
   while (map_it->second != it && map_it != free_map.end()) {
     ++map_it;
   }
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h
index 309a2a77088d6d2d182590f8b2671ea0b5a474f2..da62bc4bb61e09f8b49cdc5e67f7932aa21a860e 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
@@ -37,8 +37,8 @@ struct Chunk {
 // | Chunk | prev_ pointer | next_ pointer | payload .... |
 // *-------*---------------*---------------*--------------*
 // This implementation can just return a raw pointer, and we can get the list
-// structure by it. However, we cannot use the same code on GPU since CPU
-// cannot access GPU memory directly.
+// structure by the raw pointer. However, we cannot use the same code on GPU
+// since CPU cannot access GPU memory directly.
 //
 // So we choose to use `std::list` and return an allocation instance, which
 // contains the list node iterator, then we can unify CPU/GPU code.
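
For context on the bookkeeping that InsertFreeNode/EraseFreeNode maintain, below is a minimal, self-contained sketch (not part of this patch; all names such as ChunkIt, HighestBit, and bins are illustrative): free chunks are kept in one multimap per highest-bit bin, keyed by chunk size, so a best-fit style lookup scans bins from the request's highest bit upward and takes the smallest chunk that is large enough.

#include <array>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <list>
#include <map>

struct Chunk {
  std::size_t size = 0;
  std::size_t offset = 0;
  bool is_free = true;
};

using ChunkList = std::list<Chunk>;
using ChunkIt = ChunkList::iterator;

// Position of the highest set bit (illustrative stand-in for HighestBitPos).
static std::size_t HighestBit(std::size_t n) {
  std::size_t pos = 0;
  while (n >>= 1) ++pos;
  return pos;
}

int main() {
  ChunkList chunks;
  // One bin per possible highest-bit position; each bin maps size -> chunk.
  std::array<std::multimap<std::size_t, ChunkIt>, sizeof(std::size_t) * 8> bins;

  // Start with a single free chunk covering 1 MiB.
  chunks.push_back({1u << 20, 0, true});
  ChunkIt it = std::prev(chunks.end());
  bins[HighestBit(it->size)].emplace(it->size, it);

  // Lookup for a 300 KiB request: scan bins from the request's highest bit
  // upward and take the smallest chunk that fits.
  const std::size_t request = 300 * 1024;
  for (std::size_t b = HighestBit(request); b < bins.size(); ++b) {
    auto cand = bins[b].lower_bound(request);
    if (cand != bins[b].end()) {
      std::cout << "found free chunk of " << cand->first << " bytes\n";
      break;
    }
  }
  return 0;
}

Because each bin is a multimap, several free chunks of the same size can coexist in one bin; that is why EraseFreeNode in the patch has to scan linearly from find() until it reaches the entry whose iterator matches the chunk being removed.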