未验证 提交 69cb9792 编写于 作者: Z Zeng Jinle 提交者: GitHub

Merge pull request #16506 from sneaxiy/revert-16424-fix_allocator_bug

Revert "Fix allocator bug"
......@@ -365,6 +365,9 @@ class ExecutionContext {
auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
allocation_ptr, deleter);
PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_GE(allocation_ptr->size(),
framework::product(dim) * sizeof(T));
......
......@@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU)
......@@ -38,20 +37,30 @@ else ()
set(AllocatorFacadeDeps)
endif()
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator)
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS
${AllocatorFacadeDeps}
cpu_allocator
locked_allocator
best_fit_allocator
aligned_allocator
auto_increment_allocator
zero_size_allocator
conditional_allocator
retry_allocator
buffered_allocator
allocator_strategy
legacy_allocator
)
nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
......@@ -94,8 +94,6 @@ class AlignedAllocator : public ThinAlignedAllocator {
underlying_allocator_->Allocate(size + kAlignment, attr);
return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size);
}
void FreeImpl(Allocation* allocation) override { delete allocation; }
};
} // namespace allocation
......
......@@ -27,24 +27,16 @@ bool Allocator::IsAllocThreadSafe() const { return false; }
AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
auto ptr = AllocateImpl(size, attr);
ptr->RegisterDecoratedAllocator(this);
ptr->set_allocator(this);
return AllocationPtr(ptr);
}
void Allocator::FreeImpl(Allocation* allocation) {
Allocator* allocator = allocation->TopDecoratedAllocator();
allocator->Free(allocation);
}
void Allocator::Free(Allocation* allocation) {
allocation->PopDecoratedAllocator();
FreeImpl(allocation);
}
void Allocator::Free(Allocation* allocation) { delete allocation; }
const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
void AllocationDeleter::operator()(Allocation* allocation) const {
Allocator* allocator = allocation->TopDecoratedAllocator();
auto* allocator = allocation->allocator();
allocator->Free(allocation);
}
......
......@@ -46,56 +46,13 @@ class Allocator;
// NOTE: this is the base class of Allocation. Each allocator can use its own
// allocation object.
// NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0
/**
* Allocation is returned by Allocator::Allocate() method.
*
* An allocator may be decorated by another allocator. For example, we can
* decorate
* a RetryAllocator to any allocator to perform allocation retry when first
* allocation request fails.
*
* Explanations of Allocator design is as follows:
*
* Suppose we have an allocator which is decorated by several allocators:
*
* A(1) <- A(2) <- A(3) <- ... <- A(n)
*
* , and the public allocator is A(1).
*
* The allocation process would be:
*
* A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate()
*
* , and the free process would be:
*
* A(1).Free() -> A(2).Free() -> ... -> A(n).Free()
*
* Therefore, we should record the allocator chain when allocating, so
* that we can free the allocation in the reverse order of allocator chain.
* The field `decorated_allocators_` is used to record this chain.
*
* Another example is that we want to add additional fields in Allocation,
* e.g., something what is done in AlignedAllocator, etc.
* In this case, we should declare a derived class of Allocation, which
* contains an underlying Allocation allocated by the underlying allocator.
* Therefore, `decorated_allocators_` of the new Allocation object would
* be a new chain, differing from the underlying Allocation object.
*/
class Allocation {
public:
Allocation(void* ptr, size_t size, platform::Place place)
: ptr_(ptr), size_(size), place_(place) {
// NOTE(zjl): Since decorated_allocators_ is usually a small vector
// We reserve a small buffer to it to prevent frequent heap allocation
// Not quite sure whether we need something like gtl vector.
decorated_allocators_.reserve(8);
}
: allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}
Allocation(const Allocation& o) = delete;
Allocation& operator=(const Allocation& o) = delete;
Allocation(Allocation&& o) = delete;
Allocation& operator=(Allocation&& o) = delete;
// Returns the holding pointer.
// NOTE: For performance consideration, it is better not to make this method
......@@ -117,31 +74,17 @@ class Allocation {
const platform::Place& place() const { return place_; }
virtual ~Allocation();
private:
const std::vector<Allocator*>& DecoratedAllocators() const {
return decorated_allocators_;
}
inline void RegisterDecoratedAllocator(Allocator* allocator) {
decorated_allocators_.push_back(allocator);
}
Allocator* allocator() { return allocator_; }
inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }
void set_allocator(Allocator* allocator) { allocator_ = allocator; }
inline Allocator* TopDecoratedAllocator() {
return decorated_allocators_.back();
}
virtual ~Allocation();
private:
Allocator* allocator_;
void* ptr_;
size_t size_;
platform::Place place_;
std::vector<Allocator*> decorated_allocators_;
friend class Allocator;
friend class AllocationDeleter;
};
using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
......@@ -191,12 +134,9 @@ class Allocator {
// True if the `Allocate` is thread safe.
virtual bool IsAllocThreadSafe() const;
// This function should not be called outside
void Free(Allocation* allocation);
protected:
virtual void Free(Allocation* allocation);
virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;
virtual void FreeImpl(Allocation* allocation);
private:
friend class AllocationDeleter;
......
......@@ -49,17 +49,6 @@ namespace paddle {
namespace memory {
namespace allocation {
static inline std::shared_ptr<Allocator> WrapRetryAllocator(
std::shared_ptr<Allocator> allocator, int64_t retry_time) {
if (retry_time > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time);
allocator.reset(retry_allocator);
}
return allocator;
}
// TODO(yy): Dirty code here. This class should be configurable in runtime.
class CPUManagedAllocator : public Allocator {
public:
......@@ -123,10 +112,14 @@ class ChunkedAllocator : public Allocator {
std::shared_ptr<Allocator> CreateAllocatorWithChunk() {
chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
auto* allocation = chunks_.back().get();
std::shared_ptr<Allocator> allocator(new LockedAllocator(
std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));
std::unique_ptr<Allocator> allocator(new LockedAllocator(
std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
allocator = WrapRetryAllocator(allocator, retry_time_);
if (retry_time_ > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time_);
allocator.reset(retry_allocator);
}
return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
}
......@@ -197,23 +190,13 @@ class AllocatorFacadePrivate {
~AllocatorFacadePrivate() = default;
AllocatorFacadePrivate() {
auto strategy = GetAllocatorStrategy();
switch (strategy) {
case AllocatorStrategy::kLegacy: {
InitLegacyAllocator();
break;
}
case AllocatorStrategy::kNaiveBestFit: {
InitCPUAllocator();
InitCUDAAllocator();
InitCUDAPinnedAllocator();
WrapZeroSizeAllocator();
break;
}
default: {
PADDLE_THROW("Unsupported allocator strategy: %d",
static_cast<int>(strategy));
}
if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) {
InitLegacyAllocator();
} else {
InitCPUAllocator();
InitCUDAAllocator();
InitCUDAPinnedAllocator();
WrapZeroSizeAllocator();
}
}
......@@ -271,7 +254,8 @@ AllocatorFacade& AllocatorFacade::Instance() {
std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size, Allocator::Attr attr) {
return std::shared_ptr<Allocation>(Alloc(place, size, attr));
return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
AllocationDeleter());
}
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
......
......@@ -19,22 +19,16 @@
DEFINE_string(
allocator_strategy, "legacy",
"The allocation strategy. Legacy means the original allocator of Fluid."
"naive_best_fit means the experimental best fit allocator. "
"allocator. Enum in [legacy, naive_best_fit].");
"New means the experimental allocators of Fluid. in [legacy, new]");
namespace paddle {
namespace memory {
namespace allocation {
static AllocatorStrategy GetStrategyFromFlag() {
if (FLAGS_allocator_strategy == "legacy") {
return AllocatorStrategy::kLegacy;
} else if (FLAGS_allocator_strategy == "naive_best_fit") {
return AllocatorStrategy::kNaiveBestFit;
} else {
PADDLE_THROW("Unsupported allocator strategy: %s",
FLAGS_allocator_strategy);
}
return FLAGS_allocator_strategy == "legacy"
? AllocatorStrategy::kLegacy
: AllocatorStrategy::kNaiveBestFit;
}
AllocatorStrategy GetAllocatorStrategy() {
......
......@@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const {
}
return num;
}
void BestFitAllocator::FreeImpl(Allocation* allocation) {
void BestFitAllocator::Free(Allocation* allocation) {
auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(bf_allocation,
"The input allocation is not BestFitAllocation.");
......
......@@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator {
void InsertFreeNode(const ListIt& it);
protected:
void FreeImpl(Allocation* allocation) override;
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
......
......@@ -22,11 +22,11 @@ namespace paddle {
namespace memory {
namespace allocation {
BufferedAllocator::BufferedAllocator(std::shared_ptr<Allocator> allocator)
BufferedAllocator::BufferedAllocator(std::unique_ptr<Allocator> &&allocator)
: underlying_allocator_(std::move(allocator)) {
PADDLE_ENFORCE_NOT_NULL(
underlying_allocator_,
"Underlying allocator of BufferedAllocator must not be null");
"Underlying allocator of BufferedAllocator must be unmanaged");
if (underlying_allocator_->IsAllocThreadSafe()) {
mtx_.reset(new std::mutex());
}
......@@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) {
while (!allocations_.empty()) { // free the largest
auto it = --allocations_.end();
cur += it->second->size();
underlying_allocator_->Free(it->second.release());
delete it->second.release();
allocations_.erase(it);
if (cur >= size) return;
}
}
bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; }
void BufferedAllocator::FreeImpl(Allocation *allocation) {
bool BufferedAllocator::IsAllocThreadSafe() const {
return this->underlying_allocator_->IsAllocThreadSafe();
}
void BufferedAllocator::Free(Allocation *allocation) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
allocations_.emplace(allocation->size(), AllocationPtr(allocation));
}
Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
{
platform::LockGuardPtr<std::mutex> guard(mtx_);
......@@ -61,15 +61,17 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (it != allocations_.end() && it->first < size * 2) {
AllocationPtr result(std::move(it->second));
allocations_.erase(it);
return result.release();
return new AllocationWithUnderlying(std::move(result));
}
}
try {
return underlying_allocator_->Allocate(size, attr).release();
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
} catch (BadAlloc &) {
FreeCache(size);
return underlying_allocator_->Allocate(size, attr).release();
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
}
}
......
......@@ -31,7 +31,7 @@ namespace allocation {
// underlying_allocator_
class BufferedAllocator : public Allocator {
public:
explicit BufferedAllocator(std::shared_ptr<Allocator> allocator);
explicit BufferedAllocator(std::unique_ptr<Allocator> &&allocator);
~BufferedAllocator();
......@@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator {
void FreeCache(size_t size);
protected:
void FreeImpl(Allocation *allocation) override;
void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
std::unique_ptr<Allocator> underlying_allocator_;
std::multimap<size_t, AllocationPtr> allocations_;
std::unique_ptr<std::mutex> mtx_;
};
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/memory/allocation/buffered_allocator.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
......@@ -65,7 +66,7 @@ class StubAllocator : public Allocator {
size_t GetFreeCount() const { return destruct_count_; }
protected:
void FreeImpl(Allocation *allocation) override {
void Free(Allocation *allocation) override {
auto *alloc = dynamic_cast<StubAllocation *>(allocation);
PADDLE_ENFORCE_NOT_NULL(alloc);
if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr());
......
......@@ -20,27 +20,25 @@ namespace paddle {
namespace memory {
namespace allocation {
CPUAllocation::CPUAllocation(void *ptr, size_t size)
: Allocation(ptr, size, platform::CPUPlace()) {}
bool CPUAllocator::IsAllocThreadSafe() const { return true; }
void CPUAllocator::FreeImpl(Allocation *allocation) {
void *p = allocation->ptr();
#ifdef _WIN32
_aligned_free(p);
#else
free(p);
#endif
void CPUAllocator::Free(Allocation *allocation) {
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation *>(allocation));
free(allocation->ptr());
delete allocation;
}
Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *p;
#ifdef _WIN32
p = _aligned_malloc(size, kAlignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!",
size);
#endif
return new Allocation(p, size, platform::CPUPlace());
void *ptr;
auto status = posix_memalign(&ptr, kAlignment, size);
if (UNLIKELY(status) != 0) {
throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d",
size, status));
}
return new CPUAllocation(ptr, size);
}
} // namespace allocation
} // namespace memory
......
......@@ -31,13 +31,19 @@ namespace allocation {
//
// NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import
// an open-sourced allocator into Paddle.
class CPUAllocator;
class CPUAllocation : public Allocation {
public:
CPUAllocation(void* ptr, size_t size);
};
class CPUAllocator : public Allocator {
public:
constexpr static size_t kAlignment = 4096UL;
constexpr static size_t kAlignment = 64u;
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
};
} // namespace allocation
......
......@@ -23,14 +23,15 @@ namespace paddle {
namespace memory {
namespace allocation {
bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
void CUDAAllocator::FreeImpl(Allocation* allocation) {
void CUDAAllocator::Free(Allocation* allocation) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(allocation->place()),
auto* cuda_allocation = dynamic_cast<CUDAAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(cuda_allocation);
PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(cuda_allocation->place()),
place_);
PADDLE_ENFORCE(cudaFree(allocation->ptr()));
delete allocation;
}
Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
platform::CUDADeviceGuard guard(place_.device);
void* ptr;
......@@ -40,9 +41,8 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
"Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
status, cudaGetErrorString(status)));
}
return new Allocation(ptr, size, platform::Place(place_));
return new CUDAAllocation(ptr, size, platform::Place(place_));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -20,6 +20,13 @@ namespace paddle {
namespace memory {
namespace allocation {
// CUDA System allocator and allocation.
// Just a flag type.
class CUDAAllocation : public Allocation {
public:
using Allocation::Allocation;
};
class CUDAAllocator : public Allocator {
public:
explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {}
......@@ -28,7 +35,7 @@ class CUDAAllocator : public Allocator {
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
......
......@@ -134,22 +134,26 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
}
#ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList {
public:
GPUBuddyAllocatorList()
: allocators_(platform::GetCUDADeviceCount()),
flags_(platform::GetCUDADeviceCount()) {
allocation::GPUMemMonitor.Initialize(allocators_.size());
}
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag;
static detail::BuddyAllocator **a_arr = nullptr;
static std::vector<int> devices;
std::call_once(init_flag, [gpu_id]() {
devices = platform::GetSelectedDevices();
int gpu_num = devices.size();
BuddyAllocator *Get(size_t dev_id) {
PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
std::call_once(flags_[dev_id], [this, dev_id] {
allocation::GPUMemMonitor.Initialize(devices.size());
a_arr = new BuddyAllocator *[gpu_num];
for (size_t i = 0; i < devices.size(); ++i) {
int dev_id = devices[i];
a_arr[i] = nullptr;
platform::SetDeviceId(dev_id);
allocators_[dev_id] = new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
......@@ -163,19 +167,13 @@ class GPUBuddyAllocatorList {
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[dev_id];
}
private:
std::vector<BuddyAllocator *> allocators_;
std::vector<std::once_flag> flags_;
};
}
});
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static GPUBuddyAllocatorList allocators;
platform::SetDeviceId(gpu_id);
return allocators.Get(gpu_id);
auto pos = std::distance(devices.begin(),
std::find(devices.begin(), devices.end(), gpu_id));
return a_arr[pos];
}
#endif
......@@ -194,7 +192,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
#ifdef PADDLE_WITH_CUDA
auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr && size > 0) {
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
......@@ -349,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
return tmp_alloc;
}
void LegacyAllocator::FreeImpl(Allocation *allocation) {
void LegacyAllocator::Free(Allocation *allocation) {
boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place());
......
......@@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator {
protected:
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
void FreeImpl(Allocation *allocation) override;
void Free(Allocation *allocation) override;
private:
platform::Place place_;
......
......@@ -17,7 +17,6 @@
#include <utility>
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
namespace paddle {
namespace memory {
namespace allocation {
......@@ -25,24 +24,26 @@ namespace allocation {
bool LockedAllocator::IsAllocThreadSafe() const { return true; }
LockedAllocator::LockedAllocator(
std::shared_ptr<Allocator> underlying_allocator)
std::unique_ptr<Allocator> &&underlying_allocator)
: underlying_allocator_(std::move(underlying_allocator)) {
PADDLE_ENFORCE_NOT_NULL(underlying_allocator_);
if (!underlying_allocator_->IsAllocThreadSafe()) {
mtx_.reset(new std::mutex());
}
}
void LockedAllocator::FreeImpl(Allocation *allocation) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
underlying_allocator_->Free(allocation);
void LockedAllocator::Free(Allocation *allocation) {
{
platform::LockGuardPtr<std::mutex> guard(mtx_);
reinterpret_cast<AllocationWithUnderlying *>(allocation)
->allocation_.reset(); // Destroy inner allocation
}
delete allocation;
}
Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
return underlying_allocator_->Allocate(size, attr).release();
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -24,15 +24,15 @@ namespace allocation {
// A allocator to make underlying allocator thread safe.
class LockedAllocator : public Allocator {
public:
explicit LockedAllocator(std::shared_ptr<Allocator> underlying_allocator);
explicit LockedAllocator(std::unique_ptr<Allocator> &&underlying_allocator);
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation *allocation) override;
void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
std::unique_ptr<Allocator> underlying_allocator_;
std::unique_ptr<std::mutex> mtx_;
};
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
FLAGS_allocator_strategy = "naive_best_fit";
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), size);
}
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -20,15 +20,20 @@ namespace paddle {
namespace memory {
namespace allocation {
bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
void CPUPinnedAllocator::Free(Allocation *allocation) {
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation *>(allocation));
PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
delete allocation;
}
Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
Allocator::Attr attr) {
// PADDLE_ENFORCE_EQ(
// attr, kCrossDevice,
// "CPUPinnedAllocator should be used for Cross-Device Communication");
void *ptr;
PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
return new Allocation(ptr, size, platform::CUDAPinnedPlace());
return new CPUPinnedAllocation(ptr, size);
}
} // namespace allocation
} // namespace memory
......
......@@ -20,12 +20,18 @@ namespace memory {
namespace allocation {
// Allocator uses `cudaHostAlloc`
class CPUPinnedAllocation : public Allocation {
public:
CPUPinnedAllocation(void *ptr, size_t size)
: Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
};
class CPUPinnedAllocator : public Allocator {
public:
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation *allocation) override;
void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
};
......
......@@ -18,15 +18,25 @@ namespace paddle {
namespace memory {
namespace allocation {
void RetryAllocator::FreeImpl(Allocation* allocation) {
bool RetryAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
void RetryAllocator::Free(Allocation* allocation) {
// Delete underlying allocation first.
underlying_allocator_->Free(allocation);
cv_.notify_all();
reinterpret_cast<AllocationWithUnderlying*>(allocation)->allocation_.reset();
{
// notify all waited allocators, they can try to allocate memory after free.
std::lock_guard<std::mutex> lock(mutex_);
cv_.notify_all();
}
delete allocation;
}
Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
auto alloc_func = [&, this]() {
return underlying_allocator_->Allocate(size, attr).release();
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
};
// In fact, we can unify the code of allocation success and failure
// But it would add lock even when allocation success at the first time
......
......@@ -25,25 +25,32 @@ namespace paddle {
namespace memory {
namespace allocation {
class RetryAllocator;
class RetryAllocator : public Allocator {
public:
RetryAllocator(std::shared_ptr<Allocator> allocator, size_t retry_ms)
RetryAllocator(std::unique_ptr<Allocator>&& allocator, size_t retry_ms)
: underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) {
EnforceCheck();
}
bool IsAllocThreadSafe() const override;
private:
void EnforceCheck() {
PADDLE_ENFORCE_NOT_NULL(
underlying_allocator_,
"UnderlyingAllocator of RetryAllocator must not be null");
underlying_allocator_.get(),
"UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator");
PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(),
"UnderlyingAllocator of RetryAllocator must be thread-safe");
}
bool IsAllocThreadSafe() const override { return true; }
protected:
void FreeImpl(Allocation* allocation) override;
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
std::unique_ptr<Allocator> underlying_allocator_;
std::chrono::milliseconds retry_time_;
std::mutex mutex_;
std::condition_variable cv_;
......@@ -51,6 +58,8 @@ class RetryAllocator : public Allocator {
// For debug, We can add an atomic integer to record how many memory sizes are
// waited to allocate
// std::atomic<size_t> waited_allocate_size_{0};
friend class RetryAllocation;
};
} // namespace allocation
......
......@@ -24,20 +24,11 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const {
Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (size == 0) {
return new Allocation(nullptr, 0, place_);
return new ZeroSizeAllocation(place_);
} else {
return underlying_allocator_->Allocate(size, attr).release();
}
}
void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
if (allocation->size() == 0) {
delete allocation;
} else {
underlying_allocator_->Free(allocation);
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -24,6 +24,12 @@ namespace allocation {
// The allocator handles the request's size is zero. Allocator will always
// return an allocation even the request size is zero. However, the
// allocation.ptr() is nullptr
class ZeroSizeAllocation : public Allocation {
public:
explicit ZeroSizeAllocation(const platform::Place& p)
: Allocation(nullptr, 0, p) {}
};
class ZeroSizeAllocator : public Allocator {
public:
ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
......@@ -34,7 +40,6 @@ class ZeroSizeAllocator : public Allocator {
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
void FreeImpl(Allocation* allocation) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/platform/temporary_allocator.h"
#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
DEFINE_int64(limit_of_tmp_allocation, -1,
......@@ -30,31 +31,38 @@ namespace paddle {
namespace platform {
namespace alloc = memory::allocation;
TemporaryAllocation::TemporaryAllocation(
alloc::AllocationPtr &&underlying_allocation)
: Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)) {}
TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) {
temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>());
temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
}
bool TemporaryAllocator::IsAllocThreadSafe() const { return true; }
void TemporaryAllocator::Release(const std::function<void()> &callback) {
std::unique_ptr<std::multimap<size_t, alloc::Allocation *>> t_allocations;
std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> t_allocations;
{
std::unique_lock<std::mutex> lock(mtx_);
callback();
t_allocations.swap(temp_mem_map_);
temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>());
temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
wait_delete_mem_ = 0;
}
alloc::AllocationDeleter deleter;
for (auto tmp : *t_allocations) {
VLOG(10) << "Delete temporary allocation " << tmp.second->ptr()
<< " size: " << tmp.second->size();
deleter(tmp.second);
delete tmp.second;
}
}
void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
void TemporaryAllocator::Free(alloc::Allocation *allocation) {
auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation);
PADDLE_ENFORCE_NOT_NULL(temp_allocation);
if (platform::is_gpu_place(temp_allocation->place())) {
PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_),
"The place should be the same.");
......@@ -78,7 +86,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
}
VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
<< " size: " << temp_allocation->size();
alloc::AllocationDeleter()(temp_allocation);
delete temp_allocation;
}
size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
......@@ -113,9 +121,11 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl(
}
// If not find the the available allocation, get allocation from
// AllocatorFacadeInstance.
auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
auto raw_allocation =
alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
auto temp_mem = new TemporaryAllocation(std::move(raw_allocation));
VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size;
return temp_mem.release();
return temp_mem;
}
} // namespace platform
......
......@@ -23,6 +23,14 @@
namespace paddle {
namespace platform {
class TemporaryAllocation : public memory::allocation::Allocation {
public:
explicit TemporaryAllocation(
memory::allocation::AllocationPtr &&underlying_allocation);
memory::allocation::AllocationPtr underlying_allocation_;
};
/*! \brief the TemporaryAllocator is used to alloc the temporary allocation
* which used by CUDA's async operation.
*
......@@ -49,7 +57,7 @@ class TemporaryAllocator : public memory::allocation::Allocator {
void SetCallback(const std::function<void()> &callback);
protected:
void FreeImpl(memory::allocation::Allocation *allocation) override;
void Free(memory::allocation::Allocation *allocation) override;
memory::allocation::Allocation *AllocateImpl(
size_t size, memory::allocation::Allocator::Attr attr) override;
......@@ -58,8 +66,8 @@ class TemporaryAllocator : public memory::allocation::Allocator {
platform::Place place_;
// When the allocation is not held by any variable, it should be placed
// to temp_mem_map immediately.
std::unique_ptr<std::multimap<size_t, memory::allocation::Allocation *>>
temp_mem_map_{nullptr};
std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{
nullptr};
std::mutex mtx_;
size_t wait_delete_mem_{0};
std::function<void()> callback_;
......
......@@ -324,7 +324,6 @@ PYBIND11_MODULE(core, m) {
[](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<float>(place);
})
.def("_clear", &Tensor::clear)
.def("set", PyCPUTensorSetFromArray<float>)
.def("set", PyCPUTensorSetFromArray<int>)
.def("set", PyCPUTensorSetFromArray<double>)
......
......@@ -105,12 +105,14 @@ void Printf(const char* fmt, const Args&... args) {
Fprintf(std::cout, fmt, args...);
}
inline std::string HumanReadableSize(double f_size) {
template <typename T>
std::string HumanReadableSize(T size) {
size_t i = 0;
double f_size = static_cast<double>(size);
double orig = f_size;
const std::vector<std::string> units(
{"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
while (f_size >= 1024) {
while (f_size > 1024) {
f_size /= 1024;
i++;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册