diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 0b9545ad0b3835fe2f6f4b346e20ef0d87facf82..062be5121e2087975d2ae617c1291a92d41b4187 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <limits>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 
 namespace paddle {
@@ -111,8 +112,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   dst->set_layout(src.layout());
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
-  auto dst_ptr = dst->mutable_data(dst_place, src.type(),
-                                   memory::Allocator::kCommunication);
+  auto dst_ptr =
+      dst->mutable_data(dst_place, src.type(), memory::Allocator::kCrossDevice);
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc
index a805e19bc9f78da7b6d0baee3fa0d80a2a8de024..98b4b035861fb3bfe1531ecbf780aef395789606 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -21,6 +21,11 @@ namespace allocation {
 ThinAlignedAllocator::ThinAlignedAllocator(
     std::shared_ptr<ManagedAllocator> underlyning_allocator)
     : underlying_allocator_(std::move(underlyning_allocator)) {}
+
+std::shared_ptr<Allocation> ThinAlignedAllocator::AllocateShared(
+    size_t size, Allocator::Attr attr) {
+  return std::shared_ptr<Allocation>(Allocate(size, attr).release());
+}
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index d9eb7870c9b695959492e7edcfaed73795cf4402..3a7868f403e008265ac2fe100193a576962829a0 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -20,34 +20,66 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// The aligned allocation and allocator wrap a managed allocator and return
+// an aligned pointer.
+//
+// NOTE(yy): For speed reasons, I just use a template parameter for the
+// alignment; however, it could be a private member if necessary.
+//
+// NOTE(yy): kAlignment must be 2^N. A `static_assert` should be added.
+template <size_t kAlignment>
 class AlignedAllocation : public Allocation {
  public:
   AlignedAllocation(std::unique_ptr<Allocation>&& underlying_allocation,
                     size_t size)
-      : Allocation(AlignedPtr(underlying_allocation->ptr()), size,
+      : Allocation(AlignedPtr(underlying_allocation->ptr()),
+                   size + kAlignment - Offset(underlying_allocation->ptr()),
                    underlying_allocation->place()),
         underlying_allocation_(std::move(underlying_allocation)) {}
 
 private:
  static void* AlignedPtr(void* ptr) {
-    auto ptr_addr = reinterpret_cast<intptr_t>(ptr);
-    ptr_addr = (ptr_addr & ~(kAlignment - 1)) + kAlignment;
-    return reinterpret_cast<void*>(ptr_addr);
+    return reinterpret_cast<void*>(reinterpret_cast<intptr_t>(ptr) +
+                                   Offset(ptr));
+  }
+
+  // Offset to the aligned pointer.
+  // If ptr is already aligned, returns 0.
+  static size_t Offset(void* ptr) {
+    auto ptr_addr = reinterpret_cast<intptr_t>(ptr);
+    intptr_t aligned_addr = (ptr_addr & ~(kAlignment - 1));
+    intptr_t diff = aligned_addr - ptr_addr;
+    if (diff == 0) {
+      return 0;
+    } else {
+      return kAlignment + diff;
+    }
   }
 
   std::unique_ptr<Allocation> underlying_allocation_;
 };
 
+// The thin aligned allocator is trivial; it is used to keep the generated
+// binary small.
+//
+// NOTE(yy): This is a trick to implement a template class. The common code
+// is extracted into this `thin` base class, so that multiple specializations
+// of the template class do not grow the binary size too much.
+//
+// NOTE(yy): This could be over-design. If it harms the readability of the
+// code, it could be removed later.
 class ThinAlignedAllocator : public ManagedAllocator {
  public:
  explicit ThinAlignedAllocator(
      std::shared_ptr<ManagedAllocator> underlyning_allocator);
 
+  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;
+
 protected:
  std::shared_ptr<ManagedAllocator> underlying_allocator_;
 };
 
+// An aligned allocator allocates a `size + kAlignment` buffer and adjusts
+// the returned pointer by the alignment offset.
+template <size_t kAlignment>
 class AlignedAllocator : public ThinAlignedAllocator {
  public:
@@ -58,9 +90,6 @@ class AlignedAllocator : public ThinAlignedAllocator {
     return std::unique_ptr<Allocation>(
         new AlignedAllocation<kAlignment>(std::move(raw_allocation), size));
   }
-  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    return std::shared_ptr<Allocation>(Allocate(size, attr).release());
-  }
 };
 
 }  // namespace allocation
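The Offset arithmetic above is easy to misread: `aligned_addr` rounds *down*, so the adjustment target is the *next* alignment boundary at or above `ptr`. The following standalone sketch (not part of the patch; the 64-byte alignment is an arbitrary choice) reproduces the same arithmetic and checks the result:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Mirrors AlignedAllocation::Offset(): round down to the previous
    // boundary; if that is not ptr itself, pad forward to the next one.
    constexpr std::intptr_t kAlignment = 64;  // must be a power of two

    std::size_t Offset(void* ptr) {
      auto ptr_addr = reinterpret_cast<std::intptr_t>(ptr);
      std::intptr_t aligned_addr = ptr_addr & ~(kAlignment - 1);  // <= ptr_addr
      std::intptr_t diff = aligned_addr - ptr_addr;               // in (-64, 0]
      return diff == 0 ? 0 : kAlignment + diff;                   // in [0, 64)
    }

    int main() {
      char buffer[2 * kAlignment];
      void* raw = buffer + 3;  // almost certainly misaligned
      void* aligned = static_cast<char*>(raw) + Offset(raw);
      assert(reinterpret_cast<std::intptr_t>(aligned) % kAlignment == 0);
      return 0;
    }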
diff --git a/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu b/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
index e4d690c296cfe9aa273c9b94688b44ef62bf5e97..b61649e59d326a64aa806460feffc3a910b1cab8 100644
--- a/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
+++ b/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
@@ -18,6 +18,9 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "unsupported/Eigen/CXX11/Tensor"
+
+// NOTE(yy): this unittest is not important. It is just used for debugging
+// and can be removed later.
 struct FillZero {
  public:
  float* ptr_;
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 1ee80a3b40e449615bcab10c1e05920215ebda38..e117a2d1537a899e3d0fe990e2aece38c1cfbd63 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <utility>
+
 #pragma once
 #include <memory>
 #include <string>
@@ -21,15 +23,22 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// Exception thrown when `Alloc`/`AllocShared` fails.
 class BadAlloc : public std::exception {
  public:
-  explicit BadAlloc(const std::string& msg) : msg_(msg) {}
+  explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
   const char* what() const noexcept override;
 
 private:
  std::string msg_;
 };
 
+// Allocation is the object that holds the actually allocated pointer.
+// `Allocation::ptr()` returns the allocated pointer.
+//
+// NOTE: this is the base class of Allocation. Each allocator can use its own
+// allocation object.
+// NOTE: `Allocation::ptr()` can be nullptr if the allocation size is 0.
 class Allocation {
  public:
  Allocation(void* ptr, size_t size, platform::Place place)
@@ -38,8 +47,22 @@ class Allocation {
  Allocation(const Allocation& o) = delete;
  Allocation& operator=(const Allocation& o) = delete;
 
+  // Returns the held pointer.
+  // NOTE: For performance reasons, it is better not to make this method
+  // virtual. If we want to implement `defragmentation` later, we might need
+  // to make the `ptr_` field protected and add a virtual method like
+  // `defragmentation` to change `ptr_`.
  void* ptr() const { return ptr_; }
 
+  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
+  // last valid element.
+  //
+  // NOTE: Some allocators might allocate more memory than requested, so the
+  // size can be larger than the requested size. For example, the
+  // AlignedAllocator always allocates `size + kAlignment` bytes. Since the
+  // raw pointer might not be aligned, an offset is added to the raw pointer,
+  // and the size of the allocation becomes `size + kAlignment - offset`.
  size_t size() const { return size_; }
 
  const platform::Place& place() const { return place_; }
@@ -52,22 +75,51 @@ class Allocation {
  platform::Place place_;
 };
 
+// Base interface class of memory Allocator.
+// To allocate memory, an allocator needs two parameters:
+//   1. the size in bytes;
+//   2. the attribute of the memory.
+// NOTE: the attribute of the memory might be ignored if the allocator does
+// not care about it.
 class Allocator {
  public:
  enum Attr {
-    kDefault = 0,
-    kTiny = 1,
-    kFixedHuge = 2,
-    kFluxHuge = 3,
-    kTmp = 4,
-    kCommunication = 5,
-    NumOfAttrs = 6
+    kDefault = 0,  // Default attribute. Uses the fastest or most stable
+                   // allocation algorithm.
+
+    kFixedHuge = 1,  // The allocation may not be freed until the program
+                     // ends, e.g., `Parameters` and `Momentum`.
+
+    kFluxHuge = 2,  // The allocation may be created and freed frequently,
+                    // and is considerably huge, like `activations` and
+                    // gradients.
+
+    kScratchpad =
+        3,  // `Scratchpad` memory is allocated and freed very soon, usually
+            // within an operator or as auxiliary memory, e.g., a CUDNN
+            // workspace or the AUX memory in batch norm.
+            //
+            // https://en.wikipedia.org/wiki/Scratchpad_memory
+
+    kCrossDevice =
+        4,  // The memory is used for cross-device memory copies or
+            // communication. For example:
+            //   1. it can use `pinned` memory for CPU-GPU communication;
+            //   2. it can use `registered` memory for RDMA communication.
+
+    NumOfAttrs = 5  // The number of all attributes. Used internally.
  };
 
  virtual ~Allocator();
+
+  // Allocate an allocation. Note that the returned allocation might need to
+  // be freed manually if the Allocator is an `UnmanagedAllocator`.
  virtual std::unique_ptr<Allocation> Allocate(
      size_t size, Allocator::Attr attr = kDefault) = 0;
 
+  // True if `Allocate` is thread safe.
  virtual bool IsAllocThreadSafe() const;
 };
 
@@ -82,7 +134,8 @@ class UnmanagedAllocator : public Allocator {
  }
 };
 
-// The allocation will be managed by smart pointers
+// The allocation will be managed by smart pointers, i.e., users do not need
+// to free the allocation manually.
 class ManagedAllocator : public Allocator {
  public:
  virtual std::shared_ptr<Allocation> AllocateShared(
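The attribute is only a hint chosen at the call site; the later hunks in this diff tag CUB temporary storage as `kScratchpad` and copy-staging buffers as `kCrossDevice`. A minimal call-site sketch under those conventions (`place`, `workspace_bytes`, and `RunKernel` are hypothetical names, not part of the patch):

    // The Attr describes the buffer's expected lifetime so a capable
    // allocator can specialize; an allocator that does not care may ignore
    // it, as the comment on Allocator above says.
    auto workspace = memory::Alloc(place, workspace_bytes,
                                   memory::Allocator::kScratchpad);
    RunKernel(workspace->ptr(), workspace->size());
    // `workspace` is a std::unique_ptr<Allocation>; callers keep it alive
    // exactly as long as the buffer is needed (cf. device_context.cc below,
    // which stores it in a map until Eigen deallocates).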
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 7816aec8f788e536293f77f21cbc22ba7bcbebd5..052e1646de68bdbdb803b2ad41f3e44a70859bed 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -46,7 +46,7 @@ class CPUManagedAllocator : public ManagedAllocator {
             std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
 
   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
-    if (attr == kCommunication) {
+    if (attr == kCrossDevice) {
       return communication_allocator_->Allocate(size, attr);
     } else {
       return normal_allocator_->Allocate(size, attr);
@@ -54,7 +54,7 @@ class CPUManagedAllocator : public ManagedAllocator {
   }
 
   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    if (attr == kCommunication) {
+    if (attr == kCrossDevice) {
       return communication_allocator_->AllocateShared(size, attr);
     } else {
       return normal_allocator_->AllocateShared(size, attr);
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index a910e40badb80476ecd3d7761f44fc1d79b73982..c03d59a3f3c16f916a7c23e86fa6e3a2abf05efe 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -24,6 +24,10 @@ namespace allocation {
 // Allocator Facade is the interface exposed to other modules.
 // All the configuration or dirty code under development should
 // be hidden behind this facade.
+//
+// NOTE(yy): This class is a singleton.
+// NOTE(yy): To create a stable ABI and make compilation faster, we use the
+// Pimpl trick here.
 class AllocatorFacadePrivate;
 class AllocatorFacade {
  public:
@@ -33,13 +37,16 @@ class AllocatorFacade {
 
   static AllocatorFacade& Instance();
 
+  // Allocate a shared allocation.
   std::shared_ptr<Allocation> AllocShared(
       const platform::Place& place, size_t size,
       Allocator::Attr attr = Allocator::kDefault);
 
+  // Allocate a unique allocation.
   std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
                                     Allocator::Attr attr = Allocator::kDefault);
 
+  // TODO(yy): Allocate a Copy-On-Write allocation?
 private:
  AllocatorFacade();
  AllocatorFacadePrivate* m_;
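A hedged usage sketch of the facade (the byte count is arbitrary; the calls and the default attribute come from the header above):

    #include "paddle/fluid/memory/allocation/allocator_facade.h"

    void Example() {
      namespace alloc = paddle::memory::allocation;
      // Allocate 1 MB of default CPU memory; the shared_ptr manages the
      // buffer's lifetime, so no manual free is needed.
      auto buffer = alloc::AllocatorFacade::Instance().AllocShared(
          paddle::platform::CPUPlace(), 1 << 20, alloc::Allocator::kDefault);
      // buffer->ptr() stays valid until the last shared_ptr copy dies.
    }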
diff --git a/paddle/fluid/memory/allocation/auto_increment_allocator.h b/paddle/fluid/memory/allocation/auto_increment_allocator.h
index 9fe370b08a79ff6bf9e359ffa78b45e02ce8c89b..116d4ca6892fdceba0dda6da8a5c6039ac24ebb5 100644
--- a/paddle/fluid/memory/allocation/auto_increment_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_increment_allocator.h
@@ -24,12 +24,27 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// The AutoIncrementAllocator manages many underlying allocators. If none of
+// them can allocate the requested memory, a new allocator is created and its
+// `Allocate` method invoked.
+//
+// NOTE(yy): The AutoIncrementAllocator prefers to allocate memory from the
+// most recently successful allocator.
+//
+// NOTE(yy): We may need to release an underlying allocator if it allocates
+// nothing. However, that is generally not useful, since it would make
+// performance nondeterministic.
+//
+// NOTE(yy): This allocator is only locked when creating a new underlying
+// allocator. Allocation requests from many threads may be dispatched to the
+// same underlying allocator, so the underlying allocator must be thread
+// safe.
 class AutoIncrementAllocator : public ManagedAllocator {
  public:
+  // Creator is the method to create a new ManagedAllocator.
   using AllocatorCreator = std::function<std::shared_ptr<ManagedAllocator>()>;
 
-  template <typename Creator>
-  explicit AutoIncrementAllocator(Creator&& creator)
+  explicit AutoIncrementAllocator(AllocatorCreator&& creator)
       : creator_(std::move(creator)), prev_success_allocator_{0} {}
   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override;
   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;
@@ -65,6 +80,11 @@ class AutoIncrementAllocator : public ManagedAllocator {
       std::lock_guard<std::mutex> guard(mtx_);
       underlying_allocators_.emplace_back(creator_());
       prev_success_allocator_ = underlying_allocators_.size() - 1;
+      PADDLE_ENFORCE(
+          underlying_allocators_[prev_success_allocator_]->IsAllocThreadSafe(),
+          "the underlying allocator must be thread safe. This is a program "
+          "bug.");
+
       return callback(*underlying_allocators_[prev_success_allocator_]);
     }
   }
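To make the contract concrete, here is a hedged construction sketch; `CreateChunkAllocator` is a hypothetical factory, and the thread-safety requirement matches the PADDLE_ENFORCE added above:

    // The creator runs under the lock whenever every existing underlying
    // allocator has failed; each call must produce a new, thread-safe
    // ManagedAllocator.
    AutoIncrementAllocator auto_increment(
        []() -> std::shared_ptr<ManagedAllocator> {
          return CreateChunkAllocator();  // hypothetical factory
        });
    auto allocation =
        auto_increment.AllocateShared(1 << 12, Allocator::kDefault);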
diff --git a/paddle/fluid/memory/allocation/conditional_allocator.h b/paddle/fluid/memory/allocation/conditional_allocator.h
index f993857c79400115ecef29af164877ba830c50b3..46af1099a5c9328aa1ec5e92e6655f277cdd8e93 100644
--- a/paddle/fluid/memory/allocation/conditional_allocator.h
+++ b/paddle/fluid/memory/allocation/conditional_allocator.h
@@ -22,6 +22,22 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// A composite allocator that dispatches the allocation request according to
+// registered conditions.
+//
+// For example:
+//
+//   auto* cond_allocator = new ConditionalAllocator();
+//   cond_allocator->AddAllocator([](size_t size, Attr attr) {
+//     // if size > 10
+//     return size > 10;
+//   }, allocator_a).AddAllocator([](size_t size, Attr attr) {
+//     // elif attr is kDefault
+//     return attr == kDefault;
+//   }, allocator_b).AddAllocator([](size_t size, Attr attr) {
+//     // else
+//     return true;
+//   }, allocator_c);
 class ConditionalAllocator : public ManagedAllocator {
  public:
  ConditionalAllocator() = default;
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index e3f35685d7e89372ecd7cc0373e0cc2dffd755dc..b2df77f1227c658e6ba83075fbc0f46340305334 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -18,7 +18,13 @@
 namespace paddle {
 namespace memory {
 namespace allocation {
-
+// CPU system allocator and allocation.
+//
+// NOTE(yy): Should we just use `malloc` here, since there is an
+// aligned_allocator?
+//
+// NOTE(yy): There is no need to use `BestFitAllocator` on the CPU. We can
+// import an open-sourced allocator into Paddle.
 class CPUAllocation : public Allocation {
  public:
  CPUAllocation(void* ptr, size_t size)
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h
index 4bd4c00f976ba7058766d982879814783807085b..dea01e60890741877a387e5588fae8703dd202ac 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_allocator.h
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// CUDA system allocator and allocation.
 // Just a flag type.
 class CUDAAllocation : public Allocation {
  public:
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
index eed263f3bc50c2e2974ea1b1497c3b67c51ed7de..f092a5bad007ee6d1081c22ffacdbb6190bd4a73 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// An allocator that makes an underlying allocator thread safe.
 class LockedAllocator : public UnmanagedAllocator {
  public:
  explicit LockedAllocator(
      std::unique_ptr<UnmanagedAllocator>&& underlying_allocator);
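LockedAllocator's job is the classic monitor pattern: serialize every call into a non-thread-safe allocator behind one mutex. A standalone illustration of the idea, using a simplified stand-in interface rather than the patch's real classes:

    #include <cstddef>
    #include <memory>
    #include <mutex>

    // Simplified stand-in for the real Allocator interface.
    struct RawAllocator {
      virtual ~RawAllocator() = default;
      virtual void* Allocate(std::size_t size) = 0;
      virtual void Free(void* ptr) = 0;
    };

    // Wraps a non-thread-safe allocator; admits one caller at a time.
    class LockedRawAllocator : public RawAllocator {
     public:
      explicit LockedRawAllocator(std::unique_ptr<RawAllocator> underlying)
          : underlying_(std::move(underlying)) {}

      void* Allocate(std::size_t size) override {
        std::lock_guard<std::mutex> guard(mtx_);
        return underlying_->Allocate(size);
      }

      void Free(void* ptr) override {
        std::lock_guard<std::mutex> guard(mtx_);
        underlying_->Free(ptr);
      }

     private:
      std::unique_ptr<RawAllocator> underlying_;
      std::mutex mtx_;
    };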
However, the +// allocation.ptr() is nullptr class ZeroSizeAllocation : public Allocation { public: explicit ZeroSizeAllocation(const platform::Place& p) diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 3b9303b7e35696bec7d6cc098873162a55736c4b..0d3817c3e7c43bf7772f16ebc45841ec5ef9a62a 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include @@ -70,7 +71,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, // Allocate temporary storage auto place = boost::get(ctx.GetPlace()); auto d_temp_storage = - memory::Alloc(place, temp_storage_bytes, memory::Allocator::kTmp); + memory::Alloc(place, temp_storage_bytes, memory::Allocator::kScratchpad); // Run sorting operation cub::DeviceRadixSort::SortPairsDescending( diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 80ffc680c2a26426dea2e96a5e285abe38543c79..6b1d5e297dd3b5a41d63e61d02afd367859d1431 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -112,8 +112,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } void* allocate(size_t num_bytes) const override { - auto buf = - paddle::memory::Alloc(place_, num_bytes, memory::Allocator::kTiny); + auto buf = paddle::memory::Alloc(place_, num_bytes, + memory::Allocator::kScratchpad); void* retv = buf->ptr(); allocations_[buf->ptr()] = std::move(buf); return retv; diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 1b95ec66bd514e23001ea5e06b6b867749e16eb8..e55f734e45b6b40741386979324ac88b1b989fa1 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -64,7 +64,7 @@ struct CastToPyBufferImpl { auto *src_ptr = static_cast(tensor.data()); auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace(), - memory::Allocator::kCommunication)); + memory::Allocator::kCrossDevice)); paddle::platform::GpuMemcpySync(dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),