Commit 15076c32 authored by Yu Yang

Add comments and polish code style

Parent b4f54d33
......@@ -15,6 +15,7 @@
#include <algorithm>
#include <limits>
#include <vector>
#include "../memory/allocation/allocator.h"
#include "paddle/fluid/framework/data_type.h"
namespace paddle {
......@@ -111,8 +112,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
dst->set_layout(src.layout());
auto src_place = src.place();
auto src_ptr = src.data<void>();
- auto dst_ptr = dst->mutable_data(dst_place, src.type(),
- memory::Allocator::kCommunication);
+ auto dst_ptr =
+ dst->mutable_data(dst_place, src.type(), memory::Allocator::kCrossDevice);
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......
......@@ -21,6 +21,11 @@ namespace allocation {
ThinAlignedAllocator::ThinAlignedAllocator(
std::shared_ptr<ManagedAllocator> underlying_allocator)
: underlying_allocator_(std::move(underlying_allocator)) {}
+ std::shared_ptr<Allocation> ThinAlignedAllocator::AllocateShared(
+     size_t size, Allocator::Attr attr) {
+   return std::shared_ptr<Allocation>(Allocate(size, attr).release());
+ }
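// NOTE(editor): an equivalent form, shown only as a hedged sketch:
// std::shared_ptr can take ownership directly from a std::unique_ptr, so the
// explicit release() above is not strictly needed.
//
//   std::shared_ptr<Allocation> ThinAlignedAllocator::AllocateShared(
//       size_t size, Allocator::Attr attr) {
//     return std::shared_ptr<Allocation>(Allocate(size, attr));
//   }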
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -20,34 +20,66 @@ namespace paddle {
namespace memory {
namespace allocation {
// The aligned allocation and allocator wrap a managed allocator and
// return an aligned pointer.
//
// NOTE(yy): For speed, the alignment is a template parameter; it could be
// a private member if necessary.
//
// NOTE(yy): kAlignment must be a power of two; a `static_assert` should be
// added.
template <size_t kAlignment>
class AlignedAllocation : public Allocation {
public:
AlignedAllocation(std::unique_ptr<Allocation>&& underlying_allocation,
size_t size)
- : Allocation(AlignedPtr(underlying_allocation->ptr()), size,
+ : Allocation(AlignedPtr(underlying_allocation->ptr()),
+ size + kAlignment - Offset(underlying_allocation->ptr()),
underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)) {}
private:
static void* AlignedPtr(void* ptr) {
- auto ptr_addr = reinterpret_cast<uintptr_t>(ptr);
- ptr_addr = (ptr_addr & ~(kAlignment - 1)) + kAlignment;
- return reinterpret_cast<void*>(ptr_addr);
+ return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ptr) +
+ Offset(ptr));
}
// Offset from `ptr` to the next aligned address.
// If `ptr` is already aligned, returns 0.
static size_t Offset(void* ptr) {
auto ptr_addr = reinterpret_cast<intptr_t>(ptr);
intptr_t aligned_addr = (ptr_addr & ~(kAlignment - 1));
intptr_t diff = aligned_addr - ptr_addr;
if (diff == 0) {
return 0;
} else {
return kAlignment + diff;
}
}
std::unique_ptr<Allocation> underlying_allocation_;
};
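// NOTE(editor): a minimal sketch of the same alignment arithmetic, added for
// illustration; `AlignmentOffsetSketch` is a hypothetical name, not part of
// this commit. With kAlignment = 64 and a raw address of 100, the offset is
// 64 - (100 % 64) = 28, so the aligned address is 128 and an N-byte request
// keeps N + 64 - 28 usable bytes. It also shows the power-of-two
// `static_assert` the NOTE above asks for.
template <size_t kAlignment>
inline size_t AlignmentOffsetSketch(void* ptr) {
  static_assert((kAlignment & (kAlignment - 1)) == 0,
                "kAlignment must be a power of two");
  auto remainder = reinterpret_cast<uintptr_t>(ptr) & (kAlignment - 1);
  return remainder == 0 ? 0 : kAlignment - remainder;
}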
// ThinAlignedAllocator is trivial and exists to keep the binary small.
//
// NOTE(yy): This is a trick for writing a template class: the common code is
// extracted into a non-template `thin` base class, so multiple
// specializations of the template do not bloat the binary.
//
// NOTE(yy): This could be over-design; if it harms readability, it can be
// removed later.
class ThinAlignedAllocator : public ManagedAllocator {
public:
explicit ThinAlignedAllocator(
std::shared_ptr<ManagedAllocator> underlying_allocator);
std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;
protected:
std::shared_ptr<ManagedAllocator> underlying_allocator_;
};
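// NOTE(editor): the idiom above in miniature, as a hedged illustration
// (`ThinBase` and `Fat` are hypothetical names, not part of this commit):
// the shared code is compiled once in the non-template base, so many
// instantiations of the template add almost nothing to the binary.
//
//   class ThinBase {
//    protected:
//     void SharedHeavyCode();  // defined once in a .cc file
//   };
//
//   template <size_t kN>
//   class Fat : public ThinBase {
//     // only thin, template-dependent glue lives here
//   };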
// An aligned allocator allocates `size + kAlignment` bytes and offsets the
// returned pointer so that it is aligned.
template <size_t kAlignment>
class AlignedAllocator : public ThinAlignedAllocator {
public:
......@@ -58,9 +90,6 @@ class AlignedAllocator : public ThinAlignedAllocator {
return std::unique_ptr<Allocation>(
new AlignedAllocation<kAlignment>(std::move(raw_allocation), size));
}
- std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
- return std::shared_ptr<Allocation>(Allocate(size, attr).release());
- }
};
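// NOTE(editor): a hedged usage sketch; `base_allocator` is a hypothetical
// ManagedAllocator and the construction details are assumptions, not part of
// this commit:
//
//   std::shared_ptr<ManagedAllocator> base_allocator = /* ... */;
//   AlignedAllocator<64> aligned(base_allocator);
//   auto allocation = aligned.Allocate(100, Allocator::kDefault);
//   // allocation->ptr() is 64-byte aligned; allocation->size() can be up
//   // to 100 + 64 (see AlignedAllocation above).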
} // namespace allocation
......
......@@ -18,6 +18,9 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/for_range.h"
#include "unsupported/Eigen/CXX11/Tensor"
// NOTE(yy): this unittest is not important. It is just used for debugging
// and can be removed later.
struct FillZero {
public:
float* ptr_;
......
......@@ -12,6 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <utility>
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
......@@ -21,15 +37,22 @@ namespace paddle {
namespace memory {
namespace allocation {
// Exception thrown when `Alloc`/`AllocShared` fails.
class BadAlloc : public std::exception {
public:
- explicit BadAlloc(const std::string& msg) : msg_(msg) {}
+ explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
const char* what() const noexcept override;
private:
std::string msg_;
};
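// NOTE(editor): a hedged call-site sketch, not part of this commit; it
// assumes a glog-style LOG macro is available, as elsewhere in Paddle:
//
//   try {
//     auto allocation = memory::Alloc(place, size, Allocator::kDefault);
//   } catch (const BadAlloc& exception) {
//     LOG(ERROR) << "allocation failed: " << exception.what();
//   }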
// Allocation is the object that holds the actual pointer;
// `Allocation::ptr()` returns the allocated pointer.
//
// NOTE: this is the base class of all allocations. Each allocator can use
// its own allocation object.
// NOTE: `Allocation::ptr()` can be nullptr if the allocation size is 0.
class Allocation {
public:
Allocation(void* ptr, size_t size, platform::Place place)
......@@ -38,8 +61,22 @@ class Allocation {
Allocation(const Allocation& o) = delete;
Allocation& operator=(const Allocation& o) = delete;
// Returns the held pointer.
// NOTE: For performance, it is better not to make this method virtual. If
// we want to implement `defragmentation` later, we might need to make the
// `ptr_` field protected and add a virtual method like `defragmentation`
// to change `ptr_`.
void* ptr() const { return ptr_; }
// Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
// last valid byte.
//
// NOTE: Some allocators might allocate more memory than requested, so the
// size can be larger than the request. For example, the AlignedAllocator
// always allocates size + kAlignment bytes. Since the raw pointer might not
// be aligned, an offset may be added to it, and the size of the allocation
// becomes `size + kAlignment - offset`.
size_t size() const { return size_; }
const platform::Place& place() const { return place_; }
......@@ -52,22 +89,51 @@ class Allocation {
platform::Place place_;
};
// Base interface class of memory Allocator.
// To allocate memory, the allocator needs two parameters:
// 1. the size in bytes;
// 2. the attribute of the memory.
// NOTE: the attribute might be ignored if the allocator does not care
// about it.
class Allocator {
public:
enum Attr {
- kDefault = 0,
- kTiny = 1,
- kFixedHuge = 2,
- kFluxHuge = 3,
- kTmp = 4,
- kCommunication = 5,
- NumOfAttrs = 6
+ kDefault = 0,     // Default attribute. Uses the fastest or most stable
+                   // allocation algorithm.
+ kFixedHuge = 1,   // The allocation may not be freed until the program
+                   // ends, e.g., `Parameters` and `Momentum`.
+ kFluxHuge = 2,    // The allocation may be created and freed frequently,
+                   // and is considerably huge, like `activations` and
+                   // gradients.
+ kScratchpad = 3,  // `Scratchpad` memory is allocated and freed very soon,
+                   // usually within an operator or as aux memory, e.g., a
+                   // CUDNN workspace or the AUX memory in batch norm. See
+                   // https://en.wikipedia.org/wiki/Scratchpad_memory
+ kCrossDevice = 4, // Memory used for cross-device copy/communication.
+                   // For example:
+                   // 1. `pinned` memory for CPU-GPU communication;
+                   // 2. `registered` memory for RDMA communication.
+ NumOfAttrs = 5    // The number of all attributes; used internally.
};
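// NOTE(editor): a hedged sketch of how call sites pick an attribute; the
// places and sizes here are hypothetical, not part of this commit:
//
//   // Pinned host staging buffer for a CPU<->GPU copy.
//   auto staging = memory::Alloc(cpu_place, bytes, Allocator::kCrossDevice);
//   // Short-lived workspace inside an operator.
//   auto workspace = memory::Alloc(gpu_place, bytes, Allocator::kScratchpad);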
virtual ~Allocator();
// Allocate an allocation. Note the returned allocation might need to be
// freed manually if the Allocator is an `UnmanagedAllocator`.
virtual std::unique_ptr<Allocation> Allocate(
size_t size, Allocator::Attr attr = kDefault) = 0;
// Returns true if `Allocate` is thread safe.
virtual bool IsAllocThreadSafe() const;
};
......@@ -82,7 +148,8 @@ class UnmanagedAllocator : public Allocator {
}
};
- // The allocation will be managed by smart pointers
+ // The allocation will be managed by smart pointers, i.e., users do not
+ // need to free the allocation manually.
class ManagedAllocator : public Allocator {
public:
virtual std::shared_ptr<Allocation> AllocateShared(
......
......@@ -46,7 +46,7 @@ class CPUManagedAllocator : public ManagedAllocator {
std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
- if (attr == kCommunication) {
+ if (attr == kCrossDevice) {
return communication_allocator_->Allocate(size, attr);
} else {
return normal_allocator_->Allocate(size, attr);
......@@ -54,7 +54,7 @@ class CPUManagedAllocator : public ManagedAllocator {
}
std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
- if (attr == kCommunication) {
+ if (attr == kCrossDevice) {
return communication_allocator_->AllocateShared(size, attr);
} else {
return normal_allocator_->AllocateShared(size, attr);
......
......@@ -24,6 +24,10 @@ namespace allocation {
// Allocator Facade is the interface exposed to other modules.
// All the configuration or dirty code under development should
// be hidden behind this facade.
//
// NOTE(yy): This class is a singleton class.
// NOTE(yy): To create a stable ABI and speed up compilation, we use the
// Pimpl trick here.
class AllocatorFacadePrivate;
class AllocatorFacade {
public:
......@@ -33,13 +37,16 @@ class AllocatorFacade {
static AllocatorFacade& Instance();
// Allocate a shared allocation.
std::shared_ptr<Allocation> AllocShared(
const platform::Place& place, size_t size,
Allocator::Attr attr = Allocator::kDefault);
// Allocate a unique allocation.
std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
Allocator::Attr attr = Allocator::kDefault);
// TODO(yy): Allocate a Copy-On-Write allocation?
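// NOTE(editor): a hedged call-site sketch, not part of this commit:
//
//   auto allocation = AllocatorFacade::Instance().AllocShared(
//       platform::CPUPlace(), 1024, Allocator::kDefault);
//   // `allocation` frees itself when the last shared_ptr goes away.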
private:
AllocatorFacade();
AllocatorFacadePrivate* m_;
......
......@@ -24,12 +24,27 @@ namespace paddle {
namespace memory {
namespace allocation {
// The AutoIncrementAllocator manages many underlying allocators. If none of
// them can allocate the requested memory, a new allocator is created and
// its `Allocate` method is invoked.
//
// NOTE(yy): The AutoIncrementAllocator prefers to allocate memory from the
// most recently successful allocator.
//
// NOTE(yy): We may want to release an underlying allocator if it has
// allocated nothing. However, this is generally not useful, since it would
// make performance nondeterministic.
//
// NOTE(yy): This allocator is only locked while creating a new underlying
// allocator. Allocation requests from many threads may be dispatched to the
// same underlying allocator, so the underlying allocator must be thread
// safe.
class AutoIncrementAllocator : public ManagedAllocator {
public:
// Creator is the method used to create a ManagedAllocator.
using AllocatorCreator = std::function<std::shared_ptr<ManagedAllocator>()>;
- template <typename Creator>
- explicit AutoIncrementAllocator(Creator&& creator)
+ explicit AutoIncrementAllocator(AllocatorCreator&& creator)
: creator_(std::move(creator)), prev_success_allocator_{0} {}
std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override;
std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;
......@@ -65,6 +80,11 @@ class AutoIncrementAllocator : public ManagedAllocator {
std::lock_guard<std::mutex> guard(mtx_);
underlying_allocators_.emplace_back(creator_());
prev_success_allocator_ = underlying_allocators_.size() - 1;
PADDLE_ENFORCE(
underlying_allocators_[prev_success_allocator_]->IsAllocThreadSafe(),
"the underlying allocator must be thread safe. This is a program "
"bug.");
return callback(*underlying_allocators_[prev_success_allocator_]);
}
}
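// NOTE(editor): a hedged construction sketch; `CreateChunkAllocator` is a
// hypothetical thread-safe factory, not part of this commit. Each call to
// the creator must yield a thread-safe allocator, as enforced above:
//
//   AutoIncrementAllocator auto_increment([]() {
//     return std::shared_ptr<ManagedAllocator>(CreateChunkAllocator());
//   });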
......
......@@ -22,6 +22,22 @@ namespace paddle {
namespace memory {
namespace allocation {
// A composite allocator that dispatches allocation requests according to
// the registered conditions.
//
// For example:
//
// auto* cond_allocator = new ConditionalAllocator();
// cond_allocator->AddAllocator([](size_t size, Attr attr){
// // if size > 10
// return size > 10;
// }, allocator_a).AddAllocator([](size_t size, Attr attr){
// // elif attr is kDefault
// return attr == kDefault;
// }, allocator_b).AddAllocator([](size_t size, Attr attr){
// // else
// return true;
// }, allocator_c);
class ConditionalAllocator : public ManagedAllocator {
public:
ConditionalAllocator() = default;
......
......@@ -18,7 +18,13 @@
namespace paddle {
namespace memory {
namespace allocation {
// CPU system allocator and allocation.
//
// NOTE(yy): Should we just use `malloc` here, since there is an
// aligned_allocator?
//
// NOTE(yy): There is no need to use `BestFitAllocator` on the CPU; we can
// import an open-source allocator into Paddle.
class CPUAllocation : public Allocation {
public:
CPUAllocation(void* ptr, size_t size)
......
......@@ -20,6 +20,7 @@ namespace paddle {
namespace memory {
namespace allocation {
// CUDA System allocator and allocation.
// Just a flag type.
class CUDAAllocation : public Allocation {
public:
......
......@@ -20,6 +20,7 @@ namespace paddle {
namespace memory {
namespace allocation {
// An allocator that makes an underlying allocator thread safe.
class LockedAllocator : public UnmanagedAllocator {
public:
explicit LockedAllocator(std::unique_ptr<Allocator>&& underlying_allocator);
......
......@@ -20,6 +20,11 @@ namespace paddle {
namespace memory {
namespace allocation {
// An allocator that wraps an UnmanagedAllocator and makes the allocation
// managed by C++ smart pointers.
//
// NOTE: if the NaiveManagedAllocator is destroyed before its
// NaiveManagedAllocations, the allocations will never be released.
class NaiveManagedAllocator;
class NaiveManagedAllocation : public Allocation {
public:
......
......@@ -23,7 +23,7 @@ namespace allocation {
std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
Allocator::Attr attr) {
PADDLE_ENFORCE_EQ(
- attr, kCommunication,
+ attr, kCrossDevice,
"CPUPinnedAllocator should be used for Cross-Device Communication");
void* ptr;
......
......@@ -19,6 +19,7 @@ namespace paddle {
namespace memory {
namespace allocation {
// An allocator that uses `cudaMallocHost`.
class CPUPinnedAllocation : public Allocation {
public:
CPUPinnedAllocation(void* ptr, size_t size)
......
......@@ -22,6 +22,9 @@ namespace paddle {
namespace memory {
namespace allocation {
// The allocator that handles zero-size allocation requests. It always
// returns an allocation even when the requested size is zero; however,
// allocation.ptr() is nullptr.
class ZeroSizeAllocation : public Allocation {
public:
explicit ZeroSizeAllocation(const platform::Place& p)
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/memory/allocation/allocator.h>
#include <stdio.h>
#include <string>
#include <vector>
......@@ -70,7 +71,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
// Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
auto d_temp_storage =
- memory::Alloc(place, temp_storage_bytes, memory::Allocator::kTmp);
+ memory::Alloc(place, temp_storage_bytes, memory::Allocator::kScratchpad);
// Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>(
......
......@@ -112,8 +112,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
}
void* allocate(size_t num_bytes) const override {
- auto buf =
- paddle::memory::Alloc(place_, num_bytes, memory::Allocator::kTiny);
+ auto buf = paddle::memory::Alloc(place_, num_bytes,
+ memory::Allocator::kScratchpad);
void* retv = buf->ptr();
allocations_[buf->ptr()] = std::move(buf);
return retv;
......
......@@ -64,7 +64,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
tensor.dims(), platform::CPUPlace(),
- memory::Allocator::kCommunication));
+ memory::Allocator::kCrossDevice));
paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
sizeof(CUR_TYPE) * tensor.numel(),
......