diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 706216c8bfdcc85a891572c4290b5ba8ac76f360..8cc943c861acff4abb148bde1397343f03e896b1 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -26,7 +26,7 @@ static int HighestBitPos(size_t N) { if (UNLIKELY(N == 0)) { return 0; } else { -#ifdef __GNUC__ +#ifdef __GNUCC__ return sizeof(unsigned int) * 8 - __builtin_clz(N); #else return static_cast(std::log2(N) + 1); @@ -41,7 +41,8 @@ BestFitAllocator::BestFitAllocator(Allocation* allocation) chunk.offset_ = 0; chunk.is_free = true; chunks_.emplace_back(chunk); - InsertFreeNode(chunks_.begin()); + free_chunks_[HighestBitPos(chunk.size_)].insert( + {chunk.size_, chunks_.begin()}); } std::unique_ptr BestFitAllocator::Allocate(size_t size, Attr attr) { @@ -85,33 +86,35 @@ BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size, details::Chunk remaining; to_use.size_ = request_size; to_use.is_free = false; + remaining.size_ = remaining_size; + remaining.is_free = true; + // calc offsets to_use.offset_ = to_split_it->offset_; + remaining.offset_ = to_use.offset_ + to_use.size_; // insert to chunk list auto to_use_it = chunks_.insert(to_split_it, to_use); - if (remaining_size != 0) { - remaining.size_ = remaining_size; - remaining.is_free = true; - remaining.offset_ = to_use.offset_ + to_use.size_; - auto remaining_it = chunks_.insert(to_split_it, remaining); - InsertFreeNode(remaining_it); + if (remaining.size_ != 0) { + auto bit_size = static_cast(HighestBitPos(remaining.size_)); + free_chunks_[bit_size].insert( + {remaining.size_, chunks_.insert(to_split_it, remaining)}); } chunks_.erase(to_split_it); return to_use_it; } void BestFitAllocator::Free(Allocation* allocation) { - auto* bf_allocation = reinterpret_cast(allocation); + auto* bf_allocation = dynamic_cast(allocation); auto chunk_it = bf_allocation->ChunkIterator(); PADDLE_ENFORCE(!chunk_it->is_free); chunk_it->is_free = true; - if (chunk_it != chunks_.begin()) { // not the first chunk, try to merge prev. + if (chunk_it != chunks_.begin()) { auto prev_it = chunk_it; --prev_it; if (prev_it->is_free) { - // Merge Prev. + // Merge Left. EraseFreeNode(prev_it); prev_it->size_ += chunk_it->size_; chunks_.erase(chunk_it); @@ -122,7 +125,6 @@ void BestFitAllocator::Free(Allocation* allocation) { auto next_it = chunk_it; ++next_it; if (next_it != chunks_.end() && next_it->is_free) { - // not the last chunk, try to merge next EraseFreeNode(next_it); chunk_it->size_ += next_it->size_; chunks_.erase(next_it); @@ -137,11 +139,9 @@ void BestFitAllocator::InsertFreeNode(const ListIt& it) { free_map.insert({it->size_, it}); } void BestFitAllocator::EraseFreeNode(const ListIt& it) { - auto pos = static_cast(HighestBitPos(it->size_)); + size_t pos = static_cast(HighestBitPos(it->size_)); auto& free_map = free_chunks_[pos]; auto map_it = free_map.find(it->size_); - - // This while loop because it is a multi-map while (map_it->second != it && map_it != free_map.end()) { ++map_it; } diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 323780aa8b00d5d996a5487aa9e8d11e3ddf6ede..e5b3c938c618c290343c793bbd5dc0f8dd764521 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -42,8 +42,7 @@ static std::shared_ptr GetCommunicationAllocationFromTensor( memory::Copy(cuda_pinned, result->ptr(), boost::get(tensor.place()), - reinterpret_cast(tensor.data()), copy_size, - gpu_dev_ctx.stream()); + tensor.data(), copy_size, gpu_dev_ctx.stream()); ctx.Wait(); return result;