Revert buggy changes

test=develop

Revert buggy changes
test=develop
71c846ef · Yu Yang · dbf9f6f4 · 71c846ef · 71c846ef
2 changed files
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -26,7 +26,7 @@ static int HighestBitPos(size_t N) {
  if (UNLIKELY(N == 0)) {
    return 0;
  } else {
-#ifdef __GNUC__
+#ifdef __GNUCC__
    return sizeof(unsigned int) * 8 - __builtin_clz(N);
 #else
    return static_cast<int>(std::log2(N) + 1);
@@ -41,7 +41,8 @@ BestFitAllocator::BestFitAllocator(Allocation* allocation)
  chunk.offset_ = 0;
  chunk.is_free = true;
  chunks_.emplace_back(chunk);
-  InsertFreeNode(chunks_.begin());
+  free_chunks_[HighestBitPos(chunk.size_)].insert(
+      {chunk.size_, chunks_.begin()});
 }
 std::unique_ptr<Allocation> BestFitAllocator::Allocate(size_t size, Attr attr) {
@@ -85,33 +86,35 @@ BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size,
  details::Chunk remaining;
  to_use.size_ = request_size;
  to_use.is_free = false;
+  remaining.size_ = remaining_size;
+  remaining.is_free = true;
  // calc offsets
  to_use.offset_ = to_split_it->offset_;
+  remaining.offset_ = to_use.offset_ + to_use.size_;
  // insert to chunk list
  auto to_use_it = chunks_.insert(to_split_it, to_use);
-  if (remaining_size != 0) {
+  if (remaining.size_ != 0) {
-    remaining.size_ = remaining_size;
+    auto bit_size = static_cast<size_t>(HighestBitPos(remaining.size_));
-    remaining.is_free = true;
+    free_chunks_[bit_size].insert(
-    remaining.offset_ = to_use.offset_ + to_use.size_;
+        {remaining.size_, chunks_.insert(to_split_it, remaining)});
-    auto remaining_it = chunks_.insert(to_split_it, remaining);
-    InsertFreeNode(remaining_it);
  }
  chunks_.erase(to_split_it);
  return to_use_it;
 }
 void BestFitAllocator::Free(Allocation* allocation) {
-  auto* bf_allocation = reinterpret_cast<BestFitAllocation*>(allocation);
+  auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
  auto chunk_it = bf_allocation->ChunkIterator();
  PADDLE_ENFORCE(!chunk_it->is_free);
  chunk_it->is_free = true;
-  if (chunk_it != chunks_.begin()) {  // not the first chunk, try to merge prev.
+  if (chunk_it != chunks_.begin()) {
    auto prev_it = chunk_it;
    --prev_it;
    if (prev_it->is_free) {
-      // Merge Prev.
+      // Merge Left.
      EraseFreeNode(prev_it);
      prev_it->size_ += chunk_it->size_;
      chunks_.erase(chunk_it);
@@ -122,7 +125,6 @@ void BestFitAllocator::Free(Allocation* allocation) {
  auto next_it = chunk_it;
  ++next_it;
  if (next_it != chunks_.end() && next_it->is_free) {
-    // not the last chunk, try to merge next
    EraseFreeNode(next_it);
    chunk_it->size_ += next_it->size_;
    chunks_.erase(next_it);
@@ -137,11 +139,9 @@ void BestFitAllocator::InsertFreeNode(const ListIt& it) {
  free_map.insert({it->size_, it});
 }
 void BestFitAllocator::EraseFreeNode(const ListIt& it) {
-  auto pos = static_cast<size_t>(HighestBitPos(it->size_));
+  size_t pos = static_cast<size_t>(HighestBitPos(it->size_));
  auto& free_map = free_chunks_[pos];
  auto map_it = free_map.find(it->size_);
-  // This while loop because it is a multi-map
  while (map_it->second != it && map_it != free_map.end()) {
    ++map_it;
  }

--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -42,8 +42,7 @@ static std::shared_ptr<memory::Allocation> GetCommunicationAllocationFromTensor(
    memory::Copy(cuda_pinned, result->ptr(),
                 boost::get<platform::CUDAPlace>(tensor.place()),
-                 reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
+                 tensor.data<void>(), copy_size, gpu_dev_ctx.stream());
-                 gpu_dev_ctx.stream());
    ctx.Wait();
    return result;