// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <stdint.h>

#include <array>
#include <cstddef>
#include <list>
#include <map>

#include "paddle/fluid/memory/allocation/allocator.h"

// Forward-declare platform::Place so this header does not need to pull in
// the platform headers; Place is only ever used by reference here.
namespace paddle {
namespace platform {
class Place;
}  // namespace platform
}  // namespace paddle

namespace paddle {
namespace memory {
namespace allocation {
namespace details {
// One contiguous range of the underlying allocation, either free or in use.
// NOTE(review): `is_free` breaks the trailing-underscore convention used by
// `offset_`/`size_`; renaming would touch the .cc file, so it is kept as-is.
struct Chunk {
  bool is_free{true};
  // Offset to the base allocation.
  uintptr_t offset_{0};
  size_t size_{0};
};

// Here we use std::list to maintain chunk list.
// NOTE(yy): The traditional implementation of ChunkList is add `prev`/`next`
// pointers in `Chunk`, and split the allocation as `ChunkHeader` and
// `Payload`. Such as
//   *-------*---------------*---------------*--------------*
//   | Chunk | prev_ pointer | next_ pointer | payload .... |
//   *-------*---------------*---------------*--------------*
// This implementation can just return a raw pointer, and we can get the list
// structure by the raw pointer. However, we cannot use the same code on GPU
// since CPU cannot access GPU memory directly.
//
// So we choose to use `std::list` and return an allocation instance, which
// contains the list node iterator, then we can unify CPU/GPU code.
//
// To return an allocation is not a bad idea, since Tensor/Vector should holds
// an allocation instead of raw pointer directly.
using ChunkList = std::list<Chunk>;

// Here we use a multi-level map of free chunks.
// the map is
//      MSB offset --> size --> [ChunkList::iterator]
//
// The time complexities:
//     find a free chunk:
//          O(logN),
//               where N is the number of free nodes with the same MSB offset.
//     find the position of a chunk iterator:
//          O(logN + K),
//               where N is the number of free nodes with the same MSB offset.
//               where K is the number of free nodes with the same size.
//     insert a free chunk:
//          O(logN),
//               where N is the number of free nodes with the same MSB offset.
//     erase a free chunk:
//          O(1)
using FreeChunkBin =
    std::array<std::multimap<size_t, ChunkList::iterator>, sizeof(size_t) * 8>;
}  // namespace details

class BestFitAllocator;

// The BestFitAllocation maintain the List Node iterator.
Y
Yu Yang 已提交
82
class BestFitAllocation : public Allocation {
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
 private:
  using ListIt = typename details::ChunkList::iterator;

 public:
  BestFitAllocation(BestFitAllocator* allocator, ListIt chunk_it);

  const ListIt& ChunkIterator() const { return chunk_it_; }

 private:
  typename details::ChunkList::iterator chunk_it_;
};

// TODO(yy): Current BestFitAllocator is not thread-safe. To make it thread
// safe, we must wrap a locked_allocator. However, we can implement a thread
// safe allocator by locking each bin and chunks list independently. It will
// make BestFitAllocator faster in multi-thread situation.
//
// This allocator implements a best-fit allocator with merging the free nodes.
//
// To allocate a buffer, it will find the best-fit chunk. If the best-fit chunk
// is larger than request size, the original block will be split into two
// chunks. The first block will be used and the second block will be put into
// free chunks.
//
// To free an allocation, it will set the chunk of allocation to free and merge
// the prev-chunk and the next-chunk when possible.
Y
Yu Yang 已提交
109
class BestFitAllocator : public Allocator {
110
 public:
111
  explicit BestFitAllocator(pten::Allocation* allocation);
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128

  void* BasePtr() const { return allocation_->ptr(); }

  const platform::Place& Place() const { return allocation_->place(); }

  size_t NumFreeChunks() const;

 private:
  size_t FreeSize() const;
  using MapIt = typename details::FreeChunkBin::value_type::iterator;
  using ListIt = typename details::ChunkList::iterator;

  ListIt SplitChunk(size_t request_size, size_t free_chunk_offset,
                    MapIt bin_iterator);
  void EraseFreeNode(const ListIt& it);
  void InsertFreeNode(const ListIt& it);

Y
Yu Yang 已提交
129
 protected:
130 131
  void FreeImpl(pten::Allocation* allocation) override;
  pten::Allocation* AllocateImpl(size_t size) override;
Y
Yu Yang 已提交
132 133

 private:
134
  pten::Allocation* allocation_;  // not owned
135 136 137 138 139 140
  details::ChunkList chunks_;
  details::FreeChunkBin free_chunks_;
};
}  // namespace allocation
}  // namespace memory
}  // namespace paddle