Add comments and polish code style

15076c32 · Yu Yang · b4f54d33 · 15076c32 · 15076c32 · 15076c32
19 changed files
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <limits>
 #include <vector>
+#include "../memory/allocation/allocator.h"
 #include "paddle/fluid/framework/data_type.h"

 namespace paddle {
@@ -111,8 +112,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
  dst->set_layout(src.layout());
  auto src_place = src.place();
  auto src_ptr = src.data<void>();
-  auto dst_ptr = dst->mutable_data(dst_place, src.type(),
-                                   memory::Allocator::kCommunication);
+  auto dst_ptr =
+      dst->mutable_data(dst_place, src.type(), memory::Allocator::kCrossDevice);
  auto size = src.numel() * SizeOfType(src.type());
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,

--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -21,6 +21,11 @@ namespace allocation {
 ThinAlignedAllocator::ThinAlignedAllocator(
    std::shared_ptr<ManagedAllocator> underlyning_allocator)
    : underlying_allocator_(std::move(underlyning_allocator)) {}
+
+std::shared_ptr<Allocation> ThinAlignedAllocator::AllocateShared(
+    size_t size, Allocator::Attr attr) {
+  return std::shared_ptr<Allocation>(Allocate(size, attr).release());
+}
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -20,34 +20,66 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// The aligned allocation and allocator wrap a managed allocator and
+// return an aligned pointer.
+//
+// NOTE(yy): For speed, the alignment is a template parameter; it could
+// instead be a private member if necessary.
+//
+// NOTE(yy): kAlignment must be 2^N. A `static_assert` should be added.
 template <size_t kAlignment>
 class AlignedAllocation : public Allocation {
 public:
  AlignedAllocation(std::unique_ptr<Allocation>&& underlying_allocation,
                    size_t size)
-      : Allocation(AlignedPtr(underlying_allocation->ptr()), size,
+      : Allocation(AlignedPtr(underlying_allocation->ptr()),
+                   size + kAlignment - Offset(underlying_allocation->ptr()),
                   underlying_allocation->place()),
        underlying_allocation_(std::move(underlying_allocation)) {}

 private:
  static void* AlignedPtr(void* ptr) {
-    auto ptr_addr = reinterpret_cast<uintptr_t>(ptr);
-    ptr_addr = (ptr_addr & ~(kAlignment - 1)) + kAlignment;
-    return reinterpret_cast<void*>(ptr_addr);
+    return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ptr) +
+                                   Offset(ptr));
+  }
+
+  // Offset from ptr to the next aligned address.
+  // If ptr is already aligned, returns 0.
+  static size_t Offset(void* ptr) {
+    auto ptr_addr = reinterpret_cast<intptr_t>(ptr);
+    intptr_t aligned_addr = (ptr_addr & ~(kAlignment - 1));
+    intptr_t diff = aligned_addr - ptr_addr;
+    if (diff == 0) {
+      return 0;
+    } else {
+      return kAlignment + diff;
+    }
  }

  std::unique_ptr<Allocation> underlying_allocation_;
 };

+// The thin aligned allocator is trivial; it exists to keep the binary small.
+//
+// NOTE(yy): This is a trick for template classes. The code common to all
+// instantiations is extracted into this `thin` class, so multiple
+// instantiations of the template class do not inflate the binary size much.
+//
+// NOTE(yy): This could be over-design. If it harms code readability, it
+// could be removed later.
 class ThinAlignedAllocator : public ManagedAllocator {
 public:
  explicit ThinAlignedAllocator(
      std::shared_ptr<ManagedAllocator> underlyning_allocator);

+  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;
+
 protected:
  std::shared_ptr<ManagedAllocator> underlying_allocator_;
 };

+// An aligned allocator allocates `size + kAlignment` bytes and adjusts the
+// pointer offset so that the returned pointer is aligned.
 template <size_t kAlignment>
 class AlignedAllocator : public ThinAlignedAllocator {
 public:
@@ -58,9 +90,6 @@ class AlignedAllocator : public ThinAlignedAllocator {
    return std::unique_ptr<Allocation>(
        new AlignedAllocation<kAlignment>(std::move(raw_allocation), size));
  }
-  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    return std::shared_ptr<Allocation>(Allocate(size, attr).release());
-  }
 };

 }  // namespace allocation

--- a/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
+++ b/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
@@ -18,6 +18,9 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "unsupported/Eigen/CXX11/Tensor"
+
+// NOTE(yy): This unit test is not important. It is only used for debugging
+// and can be removed later.
 struct FillZero {
 public:
  float* ptr_;

--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -12,6 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <utility>
+
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once
 #include <memory>
 #include <string>
@@ -21,15 +37,22 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// Exception when `Alloc`/`AllocShared` failed
 class BadAlloc : public std::exception {
 public:
-  explicit BadAlloc(const std::string& msg) : msg_(msg) {}
+  explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
  const char* what() const noexcept override;

 private:
  std::string msg_;
 };

+// Allocation is the object holding the actual pointer. Calling
+// `Allocation::ptr()` returns the pointer that was allocated.
+//
+// NOTE: This is the base class of all allocations. Each allocator can use
+//       its own allocation object.
+// NOTE: `Allocation::ptr()` could be nullptr if the allocation size is 0.
 class Allocation {
 public:
  Allocation(void* ptr, size_t size, platform::Place place)
@@ -38,8 +61,22 @@ class Allocation {
  Allocation(const Allocation& o) = delete;
  Allocation& operator=(const Allocation& o) = delete;

+  // Returns the held pointer.
+  // NOTE: For performance reasons, it is better not to make this method
+  // virtual. If we want to implement `defragmentation` later, we might need
+  // to make the `ptr_` field protected and add a virtual method like
+  // `defragmentation` to change `ptr_`.
  void* ptr() const { return ptr_; }

+  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
+  // last valid element.
+  //
+  // NOTE: Some allocators might allocate more memory than requested, so the
+  // size could be larger than the request. For example,
+  //    the AlignedAllocator always allocates `size + kAlignment` bytes.
+  //    The raw pointer might not be aligned, so an offset might be added to
+  //    the raw pointer. The size of this allocation will be
+  //    `size + kAlignment - offset`.
  size_t size() const { return size_; }

  const platform::Place& place() const { return place_; }
@@ -52,22 +89,51 @@ class Allocation {
  platform::Place place_;
 };

+// Base interface class of memory allocators.
+// To allocate memory, an allocator needs two parameters:
+//    1. The size in bytes.
+//    2. The attribute of the memory.
+// NOTE: The memory attribute might be ignored if the allocator does not
+// care about it.
 class Allocator {
 public:
  enum Attr {
-    kDefault = 0,
-    kTiny = 1,
-    kFixedHuge = 2,
-    kFluxHuge = 3,
-    kTmp = 4,
-    kCommunication = 5,
-    NumOfAttrs = 6
+    kDefault = 0,  // Default attribute. Uses the fastest or most stable
+                   // allocation algorithm.
+
+    kFixedHuge = 1,  // The allocation may not be freed until the program
+                     // ends. e.g., `Parameters` and `Momentum`.
+
+    kFluxHuge = 2,  // The allocation may be created and freed frequently and
+                    // is considerably large, e.g., `activations` and
+                    // gradients.
+
+    kScratchpad =
+        3,  // The `Scratchpad` memory is allocated and freed very soon,
+            // usually within an operator or aux memory.
+            // Like CUDNN workspace, AUX memory in batch norm, etc.
+            //
+            // https://en.wikipedia.org/wiki/Scratchpad_memory
+
+    kCrossDevice =
+        4,  // The memory used for cross-device memory copy/communication.
+            // For example:
+            // 1. it can use `pinned` memory for CPU-GPU
+            //    communication.
+            // 2. it can use `registered` memory for RDMA
+            //    communication.
+
+    NumOfAttrs = 5  // The number of all attributes. It is used internally.
  };

  virtual ~Allocator();
+
+  // Allocate an allocation. Note that the returned allocation might need to
+  // be freed manually if the Allocator is an `UnmanagedAllocator`.
  virtual std::unique_ptr<Allocation> Allocate(
      size_t size, Allocator::Attr attr = kDefault) = 0;

+  // True if the `Allocate` is thread safe.
  virtual bool IsAllocThreadSafe() const;
 };

@@ -82,7 +148,8 @@ class UnmanagedAllocator : public Allocator {
  }
 };

-// The allocation will be managed by smart pointers
+// The allocation will be managed by smart pointers, i.e., users do not need
+// to free the allocation manually.
 class ManagedAllocator : public Allocator {
 public:
  virtual std::shared_ptr<Allocation> AllocateShared(

--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -46,7 +46,7 @@ class CPUManagedAllocator : public ManagedAllocator {
            std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}

  std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
-    if (attr == kCommunication) {
+    if (attr == kCrossDevice) {
      return communication_allocator_->Allocate(size, attr);
    } else {
      return normal_allocator_->Allocate(size, attr);
@@ -54,7 +54,7 @@ class CPUManagedAllocator : public ManagedAllocator {
  }

  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    if (attr == kCommunication) {
+    if (attr == kCrossDevice) {
      return communication_allocator_->AllocateShared(size, attr);
    } else {
      return normal_allocator_->AllocateShared(size, attr);

--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -24,6 +24,10 @@ namespace allocation {
 // Allocator Facade is the interface exposed to other modules.
 // All the configuration or dirty code under development should
 // be hidden behind this facade.
+//
+// NOTE(yy): This class is a singleton class.
+// NOTE(yy): To create a stable ABI and make compilation faster, we use a
+// Pimpl trick here.
 class AllocatorFacadePrivate;
 class AllocatorFacade {
 public:
@@ -33,13 +37,16 @@ class AllocatorFacade {

  static AllocatorFacade& Instance();

+  // Allocate a shared allocation.
  std::shared_ptr<Allocation> AllocShared(
      const platform::Place& place, size_t size,
      Allocator::Attr attr = Allocator::kDefault);

+  // Allocate a unique allocation.
  std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
                                    Allocator::Attr attr = Allocator::kDefault);

+  // TODO(yy): Allocate a Copy-On-Write allocation?
 private:
  AllocatorFacade();
  AllocatorFacadePrivate* m_;

--- a/paddle/fluid/memory/allocation/auto_increment_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_increment_allocator.h
@@ -24,12 +24,27 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// The AutoIncrementAllocator manages many underlying allocators. If none of
+// them can allocate the requested memory, a new allocator will be created
+// and its `allocate` method will be invoked.
+//
+// NOTE(yy): The AutoIncrementAllocator prefers to allocate memory from the
+// latest successful allocator.
+//
+// NOTE(yy): We may need to release an underlying allocator if it allocates
+// nothing. However, that is generally not useful, since it would make
+// performance nondeterministic.
+//
+// NOTE(yy): This allocator is only locked when creating a new underlying
+// allocator. Allocation requests from many threads may be dispatched to the
+// same underlying allocator, so the underlying allocator must be thread
+// safe.
 class AutoIncrementAllocator : public ManagedAllocator {
 public:
+  // AllocatorCreator is the callable used to create a ManagedAllocator.
  using AllocatorCreator = std::function<std::shared_ptr<ManagedAllocator>()>;

-  template <typename Creator>
-  explicit AutoIncrementAllocator(Creator&& creator)
+  explicit AutoIncrementAllocator(AllocatorCreator&& creator)
      : creator_(std::move(creator)), prev_success_allocator_{0} {}
  std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override;
  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;
@@ -65,6 +80,11 @@ class AutoIncrementAllocator : public ManagedAllocator {
      std::lock_guard<std::mutex> guard(mtx_);
      underlying_allocators_.emplace_back(creator_());
      prev_success_allocator_ = underlying_allocators_.size() - 1;
+      PADDLE_ENFORCE(
+          underlying_allocators_[prev_success_allocator_]->IsAllocThreadSafe(),
+          "the underlying allocator must be thread safe. This is a program "
+          "bug.");
+
      return callback(*underlying_allocators_[prev_success_allocator_]);
    }
  }

--- a/paddle/fluid/memory/allocation/conditional_allocator.h
+++ b/paddle/fluid/memory/allocation/conditional_allocator.h
@@ -22,6 +22,22 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// A composite allocator that dispatches each allocation request according to
+// the registered conditions.
+//
+// For example:
+//
+// auto* cond_allocator = new ConditionalAllocator();
+// cond_allocator->AddAllocator([](size_t size, Attr attr){
+//   // if size > 10
+//   return size > 10;
+// }, allocator_a).AddAllocator([](size_t size, Attr attr){
+//   // elif attr is kDefault
+//   return attr == kDefault;
+// }, allocator_b).AddAllocator([](size_t size, Attr attr){
+//   // else
+//   return true;
+// }, allocator_c);
 class ConditionalAllocator : public ManagedAllocator {
 public:
  ConditionalAllocator() = default;

--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -18,7 +18,13 @@
 namespace paddle {
 namespace memory {
 namespace allocation {
-
+// CPU system allocator and allocation.
+//
+// NOTE(yy): Should we just use `malloc` here, since there is an
+// aligned_allocator?
+//
+// NOTE(yy): There is no need to use `BestFitAllocator` on CPU. We can import
+// an open-source allocator into Paddle.
 class CPUAllocation : public Allocation {
 public:
  CPUAllocation(void* ptr, size_t size)

--- a/paddle/fluid/memory/allocation/cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_allocator.h
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// CUDA System allocator and allocation.
 // Just a flag type.
 class CUDAAllocation : public Allocation {
 public:

--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// An allocator that makes the underlying allocator thread safe.
 class LockedAllocator : public UnmanagedAllocator {
 public:
  explicit LockedAllocator(std::unique_ptr<Allocator>&& underlying_allocator);

--- a/paddle/fluid/memory/allocation/naive_managed_allocator.h
+++ b/paddle/fluid/memory/allocation/naive_managed_allocator.h
@@ -20,6 +20,11 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// An allocator that wraps an UnmanagedAllocator and makes the allocation
+// managed by C++ smart pointers.
+//
+// NOTE: If the NaiveManagedAllocator is destroyed before its
+// NaiveManagedAllocations, those allocations will never be released.
 class NaiveManagedAllocator;
 class NaiveManagedAllocation : public Allocation {
 public:

--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -23,7 +23,7 @@ namespace allocation {
 std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
                                                         Allocator::Attr attr) {
  PADDLE_ENFORCE_EQ(
-      attr, kCommunication,
+      attr, kCrossDevice,
      "CPUPinnedAllocator should be used for Cross-Device Communication");

  void* ptr;

--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
@@ -19,6 +19,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// Allocator that uses `cudaMallocHost`.
 class CPUPinnedAllocation : public Allocation {
 public:
  CPUPinnedAllocation(void* ptr, size_t size)

--- a/paddle/fluid/memory/allocation/zero_size_allocator.h
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.h
@@ -22,6 +22,9 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+// This allocator handles requests whose size is zero. It always returns an
+// allocation even if the requested size is zero; however, the
+// allocation.ptr() is nullptr.
 class ZeroSizeAllocation : public Allocation {
 public:
  explicit ZeroSizeAllocation(const platform::Place& p)

--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <paddle/fluid/memory/allocation/allocator.h>
 #include <stdio.h>
 #include <string>
 #include <vector>
@@ -70,7 +71,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
  // Allocate temporary storage
  auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
  auto d_temp_storage =
-      memory::Alloc(place, temp_storage_bytes, memory::Allocator::kTmp);
+      memory::Alloc(place, temp_storage_bytes, memory::Allocator::kScratchpad);

  // Run sorting operation
  cub::DeviceRadixSort::SortPairsDescending<T, int>(

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -112,8 +112,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
  }

  void* allocate(size_t num_bytes) const override {
-    auto buf =
-        paddle::memory::Alloc(place_, num_bytes, memory::Allocator::kTiny);
+    auto buf = paddle::memory::Alloc(place_, num_bytes,
+                                     memory::Allocator::kScratchpad);
    void* retv = buf->ptr();
    allocations_[buf->ptr()] = std::move(buf);
    return retv;

--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -64,7 +64,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
        auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
        auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
            tensor.dims(), platform::CPUPlace(),
-            memory::Allocator::kCommunication));
+            memory::Allocator::kCrossDevice));

        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
                                        sizeof(CUR_TYPE) * tensor.numel(),