// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <stdint.h>
#include <array>
#include <list>
#include <map>

#include "paddle/fluid/memory/allocation/allocator.h"

namespace paddle {
namespace platform {
class Place;
}  // namespace platform
}  // namespace paddle

namespace paddle {
namespace memory {
namespace allocation {
namespace details {
struct Chunk {
  bool is_free{true};
  // Offset to the base allocation.
  uintptr_t offset_;
  size_t size_;
};

// Here we use std::list to maintain chunk list.
// NOTE(yy): The traditional implementation of ChunkList is add `prev`/`next`
// pointers in `Chunk`, and split the allocation as `ChunkHeader` and
// `Payload`. Such as
//   *-------*---------------*---------------*--------------*
//   | Chunk | prev_ pointer | next_ pointer | payload .... |
//   *-------*---------------*---------------*--------------*
// This implementation can just return a raw pointer, and we can get the list
// structure by the raw pointer. However, we cannot use the same code on GPU
// since CPU cannot access GPU memory directly.
//
// So we choose to use `std::list` and return an allocation instance, which
// contains the list node iterator, then we can unify CPU/GPU code.
//
// To return an allocation is not a bad idea, since Tensor/Vector should holds
// an allocation instead of raw pointer directly.
using ChunkList = std::list<Chunk>;

// Here we use a multi-level map of free chunks.
// the map is
//      MSB offset --> size --> [ChunkList::iterator]
//
// The time complexities:
//     find a free chunk:
//          O(logN),
//               where N is the number of free nodes with the same MSB offset.
//     find the position of a chunk iterator:
//          O(logN + K),
//               where N is the number of free nodes with the same MSB offset.
//               where K is the number of free nodes with the same size.
//     insert a free chunk:
//          O(logN),
//               where N is the number of free nodes with the same MSB offset.
//     erase a free chunk:
//          O(1)
using FreeChunkBin =
    std::array<std::multimap<size_t, ChunkList::iterator>, sizeof(size_t) * 8>;
}  // namespace details

class BestFitAllocator;

// The BestFitAllocation maintain the List Node iterator.
class BestFitAllocation : public Allocation {
 private:
  using ListIt = typename details::ChunkList::iterator;

 public:
  BestFitAllocation(BestFitAllocator* allocator, ListIt chunk_it);

  const ListIt& ChunkIterator() const { return chunk_it_; }

 private:
  typename details::ChunkList::iterator chunk_it_;
};

// TODO(yy): Current BestFitAllocator is not thread-safe. To make it thread
// safe, we must wrap a locked_allocator. However, we can implement a thread
// safe allocator by locking each bin and chunks list independently. It will
// make BestFitAllocator faster in multi-thread situation.
//
// This allocator implements a best-fit allocator with merging the free nodes.
//
// To allocate a buffer, it will find the best-fit chunk. If the best-fit chunk
// is larger than request size, the original block will be split into two
// chunks. The first block will be used and the second block will be put into
// free chunks.
//
// To free an allocation, it will set the chunk of allocation to free and merge
// the prev-chunk and the next-chunk when possible.
class BestFitAllocator : public Allocator {
 public:
  explicit BestFitAllocator(pten::Allocation* allocation);

  void* BasePtr() const { return allocation_->ptr(); }

  const platform::Place& Place() const { return allocation_->place(); }

  size_t NumFreeChunks() const;

 private:
  size_t FreeSize() const;
  using MapIt = typename details::FreeChunkBin::value_type::iterator;
  using ListIt = typename details::ChunkList::iterator;

  ListIt SplitChunk(size_t request_size, size_t free_chunk_offset,
                    MapIt bin_iterator);
  void EraseFreeNode(const ListIt& it);
  void InsertFreeNode(const ListIt& it);

 protected:
  void FreeImpl(pten::Allocation* allocation) override;
  pten::Allocation* AllocateImpl(size_t size) override;

 private:
  pten::Allocation* allocation_;  // not owned
  details::ChunkList chunks_;
  details::FreeChunkBin free_chunks_;
};
}  // namespace allocation
}  // namespace memory
}  // namespace paddle