Unverified commit 431a2d6a, authored by From00, committed by GitHub

Get base pointer from Allocation (#37978)

* Get GPU BasePtr from CUDA allocation

* Fix compile error for ROCm

* Add BasePtr function for IPUPlace in naive_best_fit_allocator.cc

* Add alignment for BuddyAllocator

* Set address alignment of BuddyAllocator to 32 bytes

* Fix CI error

* Remove code for naive_best_fit strategy
Parent b0d12d99
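In short, this change threads the raw pointer obtained from the system allocator (e.g. cudaMalloc) through every Allocation wrapper, so callers can recover it via base_ptr(). A minimal usage sketch follows (my illustration, not part of the commit; it assumes FLAGS_allocator_strategy=auto_growth and uses the AllocShared/base_ptr APIs shown in the diff below):

#include "paddle/fluid/memory/malloc.h"

void Demo() {
  paddle::platform::CUDAPlace place(0);
  auto a = paddle::memory::AllocShared(place, 1024);
  void* user_ptr = a->ptr();      // may point into a larger cached chunk
  void* raw_ptr = a->base_ptr();  // start of the chunk from cudaMalloc;
                                  // raises Unimplemented under any other
                                  // allocator strategy
}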
......@@ -125,3 +125,10 @@ if(NOT WIN32)
cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)
endif(NOT WIN32)
if(WITH_GPU AND WITH_TESTING)
nv_test(base_ptr_test SRCS base_ptr_test.cu DEPS malloc gpu_info)
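  # base_ptr() is only implemented for the auto_growth strategy, so pin the
  # allocator flags the test relies on (see allocation.h below).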
set_tests_properties(base_ptr_test PROPERTIES
ENVIRONMENT "FLAGS_allocator_strategy=auto_growth;
FLAGS_use_stream_safe_cuda_allocator=true;")
endif()
......@@ -26,6 +26,7 @@ class AlignedAllocation : public Allocation {
AlignedAllocation(AllocationPtr underlying_allocation, size_t offset)
: Allocation(
reinterpret_cast<uint8_t*>(underlying_allocation->ptr()) + offset,
underlying_allocation->base_ptr(),
underlying_allocation->size() - offset,
underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)) {}
......
......@@ -23,6 +23,8 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
......@@ -84,7 +86,10 @@ class Allocator;
class Allocation {
public:
inline Allocation(void* ptr, size_t size, platform::Place place)
: ptr_(ptr), size_(size), place_(place) {}
: ptr_(ptr), base_ptr_(ptr), size_(size), place_(place) {}
inline Allocation(void* ptr, void* base_ptr, size_t size,
platform::Place place)
: ptr_(ptr), base_ptr_(base_ptr), size_(size), place_(place) {}
Allocation(const Allocation& o) = delete;
Allocation& operator=(const Allocation& o) = delete;
......@@ -98,6 +103,15 @@ class Allocation {
// method like `defragmentation` to change `ptr_`.
inline void* ptr() const { return ptr_; }
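// Returns the pointer originally obtained from the system allocator (e.g.
// cudaMalloc); it may differ from ptr() when this allocation is an offset
// view into a larger chunk. Only supported by the auto_growth strategy.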
inline void* base_ptr() const {
PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth",
paddle::platform::errors::Unimplemented(
"base_ptr() is only implemented for auto_growth "
"strategy, not support %s strategy",
FLAGS_allocator_strategy));
return base_ptr_;
}
// Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
// last valid element.
//
......@@ -126,6 +140,7 @@ class Allocation {
private:
void* ptr_;
void* base_ptr_;  // the pointer directly requested from the system
size_t size_;
platform::Place place_;
......
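The two pointers can legitimately differ: AlignedAllocation and BlockAllocation above expose a shifted ptr() while preserving the chunk's base_ptr(). A plain-C++ illustration of the invariant (my sketch, not Paddle code; the names are hypothetical):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Mirrors AlignedAllocation: a view at `offset` bytes into a chunk keeps
// the chunk's base pointer while exposing the shifted user pointer.
struct View {
  void* ptr;       // user-visible, possibly offset for alignment
  void* base_ptr;  // start of the system allocation
};

View MakeAligned(void* chunk, std::size_t offset) {
  return {reinterpret_cast<std::uint8_t*>(chunk) + offset, chunk};
}

int main() {
  unsigned char buf[64];
  View v = MakeAligned(buf, 32);
  // base_ptr never exceeds ptr, and releasing memory must use base_ptr.
  assert(reinterpret_cast<std::uintptr_t>(v.base_ptr) <=
         reinterpret_cast<std::uintptr_t>(v.ptr));
  return 0;
}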
......@@ -90,9 +90,9 @@ class CUDAGraphAllocator
public:
PrivateAllocation(CUDAGraphAllocator* allocator,
AllocationPtr underlying_allocation)
: Allocation(underlying_allocation->ptr(),
underlying_allocation->size(),
underlying_allocation->place()),
: Allocation(
underlying_allocation->ptr(), underlying_allocation->base_ptr(),
underlying_allocation->size(), underlying_allocation->place()),
allocator_(allocator->shared_from_this()),
underlying_allocation_(std::move(underlying_allocation)) {}
......
......@@ -73,7 +73,8 @@ class AutoGrowthBestFitAllocator : public Allocator {
struct BlockAllocation : public Allocation {
explicit BlockAllocation(const List<Block>::iterator &it)
: Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()),
: Allocation(it->ptr_, it->chunk_->allocation_->base_ptr(), it->size_,
it->chunk_->allocation_->place()),
block_it_(it) {}
List<Block>::iterator block_it_;
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ctime>
#include <random>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
namespace paddle {
namespace memory {
namespace allocation {
class CUDAAllocationBasePtrTest : public ::testing::Test {
public:
void SetUp() override {
place_ = platform::CUDAPlace();
alloc_times_ = 100;
batch_size_ = 10;
max_alloc_size_ = platform::GpuMaxAllocSize() / alloc_times_;
random_engine_ = std::default_random_engine(time(NULL));
dis_ = std::uniform_int_distribution<int>(0, max_alloc_size_);
}
void OneByOneAllocTest() {
for (size_t i = 0; i < alloc_times_; ++i) {
size_t size = dis_(random_engine_);
std::shared_ptr<Allocation> allocation = AllocShared(place_, size);
void* base_ptr = allocation->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
}
Release(place_);
}
void BatchByBatchAllocTest() {
std::vector<std::shared_ptr<Allocation>> allocations;
allocations.reserve(batch_size_);
size_t batch_num = alloc_times_ / batch_size_;
for (size_t i = 0; i < batch_num; ++i) {
for (size_t j = 0; j < batch_size_; ++j) {
size_t size = dis_(random_engine_);
std::shared_ptr<Allocation> allocation = AllocShared(place_, size);
void* base_ptr = allocation->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
allocations.emplace_back(allocation);
}
allocations.clear();
}
Release(place_);
}
void ContinuousAllocTest() {
std::vector<std::shared_ptr<Allocation>> allocations;
allocations.reserve(alloc_times_);
for (size_t i = 0; i < alloc_times_; ++i) {
size_t size = dis_(random_engine_);
std::shared_ptr<Allocation> allocation = AllocShared(place_, size);
void* base_ptr = allocation->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
allocations.emplace_back(allocation);
}
allocations.clear();
Release(place_);
}
void ZeroSizeAllocTest() {
std::shared_ptr<Allocation> allocation = AllocShared(place_, 0);
void* base_ptr = allocation->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
}
private:
platform::CUDAPlace place_;
size_t max_alloc_size_;
size_t alloc_times_;
size_t batch_size_;
std::default_random_engine random_engine_;
std::uniform_int_distribution<int> dis_;
};
TEST_F(CUDAAllocationBasePtrTest, base_ptr_test) {
OneByOneAllocTest();
BatchByBatchAllocTest();
ContinuousAllocTest();
ZeroSizeAllocTest();
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -42,7 +42,8 @@ namespace allocation {
class CUDADeviceContextAllocation : public Allocation {
public:
explicit CUDADeviceContextAllocation(AllocationPtr allocation)
: Allocation(allocation->ptr(), allocation->size(), allocation->place()),
: Allocation(allocation->ptr(), allocation->base_ptr(),
allocation->size(), allocation->place()),
underlying_allocation_(std::move(allocation)) {}
~CUDADeviceContextAllocation() {
......
......@@ -20,8 +20,9 @@ namespace allocation {
StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
AllocationPtr underlying_allocation, gpuStream_t owning_stream)
: Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
underlying_allocation->place()),
: Allocation(underlying_allocation->ptr(),
underlying_allocation->base_ptr(),
underlying_allocation->size(), underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)),
owning_stream_(std::move(owning_stream)) {}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <cstdlib>
#include <mutex>
#include <set>
#include <vector>
#include "gflags/gflags.h"
......@@ -197,6 +198,11 @@ class RecordedGpuMallocHelper {
if (result == gpuSuccess) {
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
#ifdef PADDLE_WITH_TESTING
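// Record every pointer handed out by cudaMalloc/hipMalloc so that
// GetBasePtr() can later map an interior pointer back to its allocation.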
gpu_ptrs.insert(*ptr);
#endif
return gpuSuccess;
} else {
RaiseNonOutOfMemoryError(&result);
......@@ -233,7 +239,22 @@ class RecordedGpuMallocHelper {
// cudaErrorCudartUnloading /
// hipErrorDeinitialized
}
#ifdef PADDLE_WITH_TESTING
gpu_ptrs.erase(ptr);
#endif
}
#ifdef PADDLE_WITH_TESTING
void *GetBasePtr(void *ptr) {
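// gpu_ptrs is sorted; upper_bound yields the first recorded base pointer
// strictly greater than `ptr`, so the preceding element is the allocation
// that contains `ptr` (begin() means no recorded allocation covers it).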
auto it = gpu_ptrs.upper_bound(ptr);
if (it == gpu_ptrs.begin()) {
return nullptr;
}
return *(--it);
}
#endif
bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total) {
......@@ -301,7 +322,9 @@ class RecordedGpuMallocHelper {
static std::once_flag once_flag_;
static std::vector<std::unique_ptr<RecordedGpuMallocHelper>> instances_;
}; // NOLINT
std::set<void *> gpu_ptrs; // just for testing
}; // NOLINT
std::once_flag RecordedGpuMallocHelper::once_flag_;
std::vector<std::unique_ptr<RecordedGpuMallocHelper>>
......@@ -352,5 +375,11 @@ void EmptyCache(void) {
}
}
#ifdef PADDLE_WITH_TESTING
void *GetGpuBasePtr(void *ptr, int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr);
}
#endif
} // namespace platform
} // namespace paddle
......@@ -145,6 +145,11 @@ bool IsGpuMallocRecorded(int dev_id);
//! Empty idle cached memory held by the allocator.
void EmptyCache(void);
//! Get the primitive pointer returned from cudaMalloc; just for testing
#ifdef PADDLE_WITH_TESTING
void *GetGpuBasePtr(void *ptr, int dev_id);
#endif
} // namespace platform
} // namespace paddle
......