Unverified commit 6d353aa5, authored by Leo Chen, committed by GitHub

refine auto_growth allocator (#35732)

* do not use AlignedAllocator when CUDA allocations already satisfy the required alignment

* update tests

* fix error when running in multiple processes
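The core of the change, approximated below as a minimal standalone sketch: wrap the CUDA allocator with an aligned allocator only when the device's textureAlignment is smaller than the required chunk alignment, and fall back to wrapping whenever the device query fails (e.g. cudaErrorInitializationError in a forked dataloader worker). The sketch calls the CUDA runtime directly, whereas the commit goes through platform::GetDeviceProperties; NeedAddrAlign is a hypothetical helper name.

```cpp
// Minimal sketch, not Paddle code: decide whether an extra alignment layer
// is needed on top of the raw CUDA allocator.
#include <cstddef>
#include <cuda_runtime.h>

bool NeedAddrAlign(int device_id, size_t required_alignment) {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) {
    // Be conservative when the device cannot be queried, e.g. after fork.
    return true;
  }
  // cudaMalloc already returns addresses aligned to at least textureAlignment,
  // so the extra aligned-allocator layer is only needed below that guarantee.
  return prop.textureAlignment < required_alignment;
}
```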
Parent commit: e5b4dd73
paddle/fluid/memory/allocation/aligned_allocator.h
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
+// For memory address alignment
 class AlignedAllocation : public Allocation {
  public:
   AlignedAllocation(AllocationPtr underlying_allocation, size_t offset)
......
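For context, here is a rough, self-contained sketch of the usual technique behind such an AlignedAllocation. This is an assumption about the general approach, not code taken from aligned_allocator.cc, and AlignmentOffset is a hypothetical helper name: over-allocate by the alignment and expose the first aligned address inside the block, remembering the offset so the original allocation can be recovered on free.

```cpp
#include <cstddef>
#include <cstdint>

// Offset from a raw pointer to the next address that is a multiple of
// `alignment` (0 if the pointer is already aligned).
inline size_t AlignmentOffset(const void* raw, size_t alignment) {
  auto addr = reinterpret_cast<uintptr_t>(raw);
  uintptr_t aligned = ((addr + alignment - 1) / alignment) * alignment;
  return static_cast<size_t>(aligned - addr);
}

// Example: with alignment = 256 and a raw address ending in 0x08, the offset
// is 248 and the exposed address is the next multiple of 256.
```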
paddle/fluid/memory/allocation/allocator_facade.cc
@@ -23,6 +23,7 @@
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
 #endif
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
@@ -201,6 +202,8 @@ class AllocatorFacadePrivate {
   inline const std::shared_ptr<Allocator>& GetAllocator(
       const platform::Place& place, size_t size) {
+    VLOG(4) << "GetAllocator"
+            << " " << place << " " << size;
     const auto& allocators =
         (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                           : GetAllocatorMap())
@@ -256,8 +259,39 @@ class AllocatorFacadePrivate {
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                    bool allow_free_idle_chunk) {
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto alignment = platform::GpuMinChunkSize();
+    bool need_addr_align = true;
+    // NOTE: since the CUDA runtime cannot survive a fork, calling any CUDA
+    // API in a forked child process may return cuda error(3), i.e.,
+    // cudaErrorInitializationError, even though the CUDAAllocator has only
+    // been created and not actually used yet. The try-catch below handles the
+    // case where GetDeviceProperties() fails in such a child process (for
+    // example, in a dataloader with num_workers > 0).
+    try {
+      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
+      need_addr_align = prop.textureAlignment < alignment;
+      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
+              << prop.textureAlignment
+              << ", set need_addr_align=" << need_addr_align;
+    } catch (...) {
+      need_addr_align = true;
+      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
+    }
+    // Addresses returned by cudaMalloc are already aligned to at least
+    // textureAlignment, ref:
+    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
+    std::shared_ptr<Allocator> underlying_allocator{nullptr};
+    if (need_addr_align) {
+      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
+      underlying_allocator =
+          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
+    } else {
+      VLOG(10) << "do not use AlignedAllocator, alignment: " << alignment;
+      underlying_allocator = cuda_allocator;
+    }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+        underlying_allocator, alignment, /*chunk_size=*/0,
+        allow_free_idle_chunk);
   }
 #endif
......
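The wiring introduced above can be summarized with the hypothetical stand-in types below (not the real Paddle classes): the facade now chooses the underlying allocator first, optionally wrapping it in an aligned allocator, and passes it together with chunk_size = 0 to the auto-growth allocator, which no longer wraps internally.

```cpp
#include <cstddef>
#include <memory>

// Stand-ins only; the real classes live in paddle/fluid/memory/allocation.
struct Allocator { virtual ~Allocator() = default; };
struct CudaAllocatorStub : Allocator {};
struct AlignedAllocatorStub : Allocator {
  AlignedAllocatorStub(std::shared_ptr<Allocator>, size_t) {}
};
struct AutoGrowthStub : Allocator {
  AutoGrowthStub(std::shared_ptr<Allocator>, size_t /*alignment*/,
                 size_t /*chunk_size*/, bool /*allow_free_idle_chunk*/) {}
};

std::shared_ptr<Allocator> BuildCudaAutoGrowth(bool need_addr_align,
                                               size_t alignment,
                                               bool allow_free_idle_chunk) {
  auto cuda = std::make_shared<CudaAllocatorStub>();
  std::shared_ptr<Allocator> underlying{nullptr};
  if (need_addr_align) {
    underlying = std::make_shared<AlignedAllocatorStub>(cuda, alignment);
  } else {
    underlying = cuda;  // CUDA allocations are already sufficiently aligned
  }
  // chunk_size = 0: the auto-growth allocator clamps it to at least `alignment`.
  return std::make_shared<AutoGrowthStub>(underlying, alignment,
                                          /*chunk_size=*/0,
                                          allow_free_idle_chunk);
}
```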
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -40,14 +40,14 @@ namespace allocation {
 AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
     const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
     size_t chunk_size, bool allow_free_idle_chunk)
-    : underlying_allocator_(
-          std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
+    : underlying_allocator_(underlying_allocator),
       alignment_(alignment),
       chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
       allow_free_idle_chunk_(allow_free_idle_chunk) {}

-Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
-  size = AlignedSize(size, alignment_);
+Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) {
+  size_t size = AlignedSize(unaligned_size, alignment_);
+  VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size;

   std::lock_guard<SpinLock> guard(spinlock_);
   auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
@@ -57,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
     free_blocks_.erase(iter);
     auto *chunk = block_it->chunk_;
     size_t remaining_size = block_it->size_ - size;
+    VLOG(10) << "Allocate " << size << " bytes from chunk size "
+             << block_it->size_ << ", remaining " << remaining_size;
     if (remaining_size == 0) {
       block_it->is_free_ = false;
     } else {
@@ -95,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
     }

     blocks.emplace_back(p + remaining_size, size, false, chunk);
     block_it = --(blocks.end());
-    VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining "
-            << remaining_size;
+    VLOG(2) << "Not found and reallocate " << realloc_size << "("
+            << static_cast<void *>(p) << "), and remaining " << remaining_size;
   }
   return new BlockAllocation(block_it);
 }

 void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
+  VLOG(10) << "Free " << allocation->size() << " bytes";
   std::lock_guard<SpinLock> guard(spinlock_);
   auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
   auto &blocks = block_it->chunk_->blocks_;
......
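A self-contained sketch of the best-fit lookup that AllocateImpl performs, simplified and using hypothetical types: free blocks sit in an ordered set keyed by (size, address), lower_bound picks the smallest block that can hold the aligned request, and any remainder goes back on the free list. The real allocator additionally tracks blocks per chunk and carves the allocation from the tail of the block.

```cpp
#include <cstddef>
#include <cstdint>
#include <set>
#include <utility>

struct FreeList {
  // (block size, block start address), ordered so lower_bound gives best fit.
  std::set<std::pair<size_t, uintptr_t>> blocks_;

  // Returns the chosen (size, addr) pair, or {0, 0} if no free block fits;
  // in that case the real allocator grows a new chunk from the underlying
  // allocator instead.
  std::pair<size_t, uintptr_t> TakeBestFit(size_t unaligned_size,
                                           size_t alignment) {
    size_t size = ((unaligned_size + alignment - 1) / alignment) * alignment;
    auto it = blocks_.lower_bound({size, 0});
    if (it == blocks_.end()) return {0, 0};
    auto chosen = *it;
    blocks_.erase(it);
    size_t remaining = chosen.first - size;
    if (remaining > 0) {
      // Put the unused tail of the block back on the free list.
      blocks_.insert({remaining, chosen.second + size});
    }
    return {size, chosen.second};
  }
};
```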
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include <cstdlib>
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include "gtest/gtest.h"

 DECLARE_bool(free_idle_chunk);
@@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk,
   FLAGS_free_idle_chunk = free_idle_chunk;
   FLAGS_free_when_no_cache_hit = free_when_no_cache_hit;
   auto recorded_allocator = std::make_shared<RecordedAllocator>();
+
   size_t alignment = 4096;
   size_t memory_size = 8192;
+  auto underlying_allocator =
+      std::make_shared<AlignedAllocator>(recorded_allocator, alignment);
   auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-      recorded_allocator, alignment);
+      underlying_allocator, alignment);

   for (size_t i = 0; i < 10; ++i) {
     auto allocation = ag_allocator->Allocate(memory_size);
@@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) {
   auto underlying_allocator =
       std::make_shared<LimitedResourceAllocator>(memory_capacity);
+  auto aligned_allocator =
+      std::make_shared<AlignedAllocator>(underlying_allocator, alignment);
   auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-      underlying_allocator, alignment);
+      aligned_allocator, alignment);

   ag_allocator->Allocate(allocate_size[0]);
   ASSERT_EQ(underlying_allocator->AllocatedSize(),
......
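As a small usage-style illustration (a hypothetical gtest, not part of the test file above), the rounding behavior the tests rely on can be checked directly: every request routed through the aligned wrapper is rounded up to a multiple of the alignment, so an 8193-byte request with 4096-byte alignment consumes 12288 bytes.

```cpp
#include <cstddef>
#include "gtest/gtest.h"  // link against gtest_main for the test runner

// Same rounding rule as an AlignedSize-style helper: round up to a multiple
// of `alignment`.
static size_t RoundUp(size_t size, size_t alignment) {
  return ((size + alignment - 1) / alignment) * alignment;
}

TEST(AlignedSizeSketch, RoundsUpToAlignment) {
  EXPECT_EQ(RoundUp(1, 4096), 4096u);
  EXPECT_EQ(RoundUp(4096, 4096), 4096u);
  EXPECT_EQ(RoundUp(8193, 4096), 12288u);
}
```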