best_fit_allocator.h
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <array>
#include <list>
#include <map>
#include "paddle/fluid/memory/allocation/allocator.h"

namespace paddle {
namespace memory {
namespace allocation {
namespace details {
struct Chunk {
  bool is_free{true};
  // Offset from the base pointer of the underlying allocation, in bytes.
  uintptr_t offset_;
  // Size of this chunk, in bytes.
  size_t size_;
};

// Here we use std::list to maintain the chunk list.
// NOTE(yy): The traditional implementation of a chunk list is to add
// `prev`/`next` pointers to `Chunk` and split each allocation into a
// `ChunkHeader` and a `Payload`, such as
//   *-------*---------------*---------------*--------------*
//   | Chunk | prev_ pointer | next_ pointer | payload .... |
//   *-------*---------------*---------------*--------------*
// Such an implementation can simply return a raw pointer, and the list
// structure can be recovered from that raw pointer. However, we cannot use
// the same code on GPU, since the CPU cannot access GPU memory directly.
//
// So we choose `std::list` and return an allocation instance that contains
// the list node iterator; this lets us unify the CPU and GPU code.
//
// Returning an allocation is not a bad idea anyway, since Tensor/Vector
// should hold an allocation rather than a raw pointer.
using ChunkList = std::list<Chunk>;
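
// Illustrative sketch (hypothetical helper, not part of this header): since a
// `Chunk` stores an offset rather than a pointer, the user-visible address of
// a chunk would be derived from the base pointer of the underlying
// allocation. The chunk metadata itself always lives in CPU memory, which is
// why the same bookkeeping can manage both CPU and GPU buffers.
inline void* ChunkPtrSketch(void* base_ptr, const Chunk& chunk) {
  // The chunk's payload starts `offset_` bytes past the allocation's base.
  return static_cast<char*>(base_ptr) + chunk.offset_;
}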

// Here we use a multi-level map of free chunks.
// The map is
//      MSB offset --> size --> [ChunkList::iterator]
// where `MSB offset` is the bit position of the most significant set bit of
// the chunk size, i.e. free chunks are binned by the power of two their size
// falls into.
//
// The time complexities are:
//     find a free chunk:
//          O(logN),
//               where N is the number of free chunks in the same MSB bin.
//     find the position of a chunk iterator:
//          O(logN + K),
//               where N is the number of free chunks in the same MSB bin
//               and K is the number of free chunks with the same size.
//     insert a free chunk:
//          O(logN),
//               where N is the number of free chunks in the same MSB bin.
//     erase a free chunk:
//          O(1)
using FreeChunkBin =
    std::array<std::multimap<size_t, ChunkList::iterator>, sizeof(size_t) * 8>;
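
// Illustrative sketch (hypothetical helpers, not part of this header's real
// interface): one way the MSB bin index could be computed and how a best-fit
// candidate could be looked up inside a single bin. The actual lookup logic
// lives in the corresponding .cc file and may differ; in particular, a real
// search must fall back to higher bins when the current bin has no chunk
// large enough.
inline size_t HighestBitPosSketch(size_t size) {
  size_t pos = 0;
  while (size >>= 1) ++pos;
  return pos;  // bit position of the most significant set bit
}

inline FreeChunkBin::value_type::iterator FindBestFitInBinSketch(
    FreeChunkBin* bins, size_t request_size) {
  auto& bin = (*bins)[HighestBitPosSketch(request_size)];
  // std::multimap::lower_bound returns the first (i.e. smallest) free chunk
  // whose size is >= request_size: the best fit within this bin.
  return bin.lower_bound(request_size);
}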
}  // namespace details

class BestFitAllocator;

// The BestFitAllocation holds the list node iterator of its chunk.
class BestFitAllocation : public Allocation {
 private:
  using ListIt = typename details::ChunkList::iterator;

 public:
  BestFitAllocation(BestFitAllocator* allocator, ListIt chunk_it);

  const ListIt& ChunkIterator() const { return chunk_it_; }

 private:
  BestFitAllocator* allocator_;
  typename details::ChunkList::iterator chunk_it_;
};

// TODO(yy): The current BestFitAllocator is not thread-safe. To make it
// thread-safe, we must wrap it with a locked_allocator. However, we could
// also implement a thread-safe allocator by locking each bin and the chunk
// list independently, which would make BestFitAllocator faster in
// multi-threaded situations.
//
// This allocator implements best-fit allocation and merges adjacent free
// chunks.
//
// To allocate a buffer, it finds the best-fit free chunk. If the best-fit
// chunk is larger than the requested size, that chunk is split into two: the
// first part is returned to the caller and the second part is put back into
// the free chunks.
//
// To free an allocation, it marks the allocation's chunk as free and merges
// it with the previous and next chunks when possible.
class BestFitAllocator : public UnmanagedAllocator {
 public:
  explicit BestFitAllocator(Allocation* allocation);

  void* BasePtr() const { return allocation_->ptr(); }

  const platform::Place& Place() const { return allocation_->place(); }

  std::unique_ptr<Allocation> Allocate(size_t size,
                                       Attr attr = kDefault) override;
  void Free(Allocation* allocation) override;

  size_t NumFreeChunks() const;

 private:
  size_t FreeSize() const;
  using MapIt = typename details::FreeChunkBin::value_type::iterator;
  using ListIt = typename details::ChunkList::iterator;

  ListIt SplitChunk(size_t request_size, size_t free_chunk_offset,
                    MapIt bin_iterator);
  void EraseFreeNode(const ListIt& it);
  void InsertFreeNode(const ListIt& it);

  Allocation* allocation_;  // not owned
  details::ChunkList chunks_;
  details::FreeChunkBin free_chunks_;
};
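
// Illustrative usage sketch (hypothetical free function, not part of this
// header): how the interface above is intended to be combined. The underlying
// Allocation is assumed to come from some other allocator, and the exact
// ownership protocol around Free() is not visible from this header, so this
// is a sketch rather than the canonical calling convention.
inline void BestFitAllocatorUsageSketch(Allocation* underlying) {
  BestFitAllocator best_fit(underlying);  // sub-allocates within `underlying`
  auto a = best_fit.Allocate(1024);       // best-fit chunk, split if larger
  auto b = best_fit.Allocate(4096);
  best_fit.Free(a.get());                 // marks a's chunk free, merges neighbours
  best_fit.Free(b.get());
  // The Allocation handles `a` and `b` are still owned by their unique_ptrs
  // here (an assumption about the ownership protocol).
}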
}  // namespace allocation
}  // namespace memory
}  // namespace paddle