diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index 706216c8bfdcc85a891572c4290b5ba8ac76f360..8cc943c861acff4abb148bde1397343f03e896b1 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -26,7 +26,7 @@ static int HighestBitPos(size_t N) {
   if (UNLIKELY(N == 0)) {
     return 0;
   } else {
 #ifdef __GNUC__
-    return sizeof(unsigned int) * 8 - __builtin_clz(N);
+    return sizeof(unsigned long long) * 8 - __builtin_clzll(N);  // 64-bit safe
 #else
     return static_cast<int>(std::log2(N) + 1);
@@ -41,7 +41,8 @@ BestFitAllocator::BestFitAllocator(Allocation* allocation)
   chunk.offset_ = 0;
   chunk.is_free = true;
   chunks_.emplace_back(chunk);
-  InsertFreeNode(chunks_.begin());
+  free_chunks_[HighestBitPos(chunk.size_)].insert(
+      {chunk.size_, chunks_.begin()});
 }
 
 std::unique_ptr<Allocation> BestFitAllocator::Allocate(size_t size, Attr attr) {
@@ -85,33 +86,35 @@ BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size,
   details::Chunk remaining;
   to_use.size_ = request_size;
   to_use.is_free = false;
+  remaining.size_ = remaining_size;
+  remaining.is_free = true;
+
   // calc offsets
   to_use.offset_ = to_split_it->offset_;
+  remaining.offset_ = to_use.offset_ + to_use.size_;
 
   // insert to chunk list
   auto to_use_it = chunks_.insert(to_split_it, to_use);
-  if (remaining_size != 0) {
-    remaining.size_ = remaining_size;
-    remaining.is_free = true;
-    remaining.offset_ = to_use.offset_ + to_use.size_;
-    auto remaining_it = chunks_.insert(to_split_it, remaining);
-    InsertFreeNode(remaining_it);
+  if (remaining.size_ != 0) {
+    auto bit_size = static_cast<size_t>(HighestBitPos(remaining.size_));
+    free_chunks_[bit_size].insert(
+        {remaining.size_, chunks_.insert(to_split_it, remaining)});
   }
   chunks_.erase(to_split_it);
   return to_use_it;
 }
 
 void BestFitAllocator::Free(Allocation* allocation) {
-  auto* bf_allocation = reinterpret_cast<BestFitAllocation*>(allocation);
+  auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
   auto chunk_it = bf_allocation->ChunkIterator();
   PADDLE_ENFORCE(!chunk_it->is_free);
   chunk_it->is_free = true;
-  if (chunk_it != chunks_.begin()) {  // not the first chunk, try to merge prev.
+  if (chunk_it != chunks_.begin()) {
     auto prev_it = chunk_it;
     --prev_it;
 
     if (prev_it->is_free) {
-      // Merge Prev.
+      // Merge Left.
       EraseFreeNode(prev_it);
       prev_it->size_ += chunk_it->size_;
       chunks_.erase(chunk_it);
@@ -122,7 +125,6 @@ void BestFitAllocator::Free(Allocation* allocation) {
   auto next_it = chunk_it;
   ++next_it;
   if (next_it != chunks_.end() && next_it->is_free) {
-    // not the last chunk, try to merge next
     EraseFreeNode(next_it);
     chunk_it->size_ += next_it->size_;
     chunks_.erase(next_it);
@@ -137,11 +139,9 @@ void BestFitAllocator::InsertFreeNode(const ListIt& it) {
   free_map.insert({it->size_, it});
 }
 void BestFitAllocator::EraseFreeNode(const ListIt& it) {
-  auto pos = static_cast<size_t>(HighestBitPos(it->size_));
+  size_t pos = static_cast<size_t>(HighestBitPos(it->size_));
   auto& free_map = free_chunks_[pos];
   auto map_it = free_map.find(it->size_);
-
-  // This while loop because it is a multi-map
-  while (map_it->second != it && map_it != free_map.end()) {
+  while (map_it != free_map.end() && map_it->second != it) {  // end() first
     ++map_it;
   }
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 323780aa8b00d5d996a5487aa9e8d11e3ddf6ede..e5b3c938c618c290343c793bbd5dc0f8dd764521 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -42,8 +42,7 @@ static std::shared_ptr<memory::Allocation> GetCommunicationAllocationFromTensor(
 
     memory::Copy(cuda_pinned, result->ptr(),
                  boost::get<platform::CUDAPlace>(tensor.place()),
-                 reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
-                 gpu_dev_ctx.stream());
+                 tensor.data<void>(), copy_size, gpu_dev_ctx.stream());
 
     ctx.Wait();
     return result;