diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index c0d1934a703b66a8ab8a1eab0c1d0680d73b9e17..e0e179a0860e1d36f3b193c10f7bc414a3b57b08 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -125,3 +125,10 @@ if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) endif(NOT WIN32) + +if(WITH_GPU AND WITH_TESTING) + nv_test(base_ptr_test SRCS base_ptr_test.cu DEPS malloc gpu_info) + set_tests_properties(base_ptr_test PROPERTIES + ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; + FLAGS_use_stream_safe_cuda_allocator=true;") +endif() diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index f0b7f1a4b0d9e704cbe3595eb14d42e68bb4006b..10380c0d6028d57422e17a7c1dff7845ad0390f1 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -26,6 +26,7 @@ class AlignedAllocation : public Allocation { AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) : Allocation( reinterpret_cast<uint8_t*>(underlying_allocation->ptr()) + offset, + underlying_allocation->base_ptr(), underlying_allocation->size() - offset, underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)) {} diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index b11c657b96b74cc078f81bcd6201218a5045f340..ee802462ddc943244fc9cbdbcd7cb8cdd52f8e47 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -23,6 +23,8 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +DECLARE_string(allocator_strategy); + namespace paddle { namespace memory { namespace allocation { @@ -84,7 +86,10 @@ class
Allocator; class Allocation { public: inline Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), size_(size), place_(place) {} + : ptr_(ptr), base_ptr_(ptr), size_(size), place_(place) {} + inline Allocation(void* ptr, void* base_ptr, size_t size, + platform::Place place) + : ptr_(ptr), base_ptr_(base_ptr), size_(size), place_(place) {} Allocation(const Allocation& o) = delete; Allocation& operator=(const Allocation& o) = delete; @@ -98,6 +103,15 @@ class Allocation { // method like `defragmentation` to change `ptr_`. inline void* ptr() const { return ptr_; } + inline void* base_ptr() const { + PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", + paddle::platform::errors::Unimplemented( + "base_ptr() is only implemented for auto_growth " + "strategy, not support %s strategy", + FLAGS_allocator_strategy)); + return base_ptr_; + } + // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the // last valid element. // @@ -126,6 +140,7 @@ class Allocation { private: void* ptr_; + void* base_ptr_; // the point that directly requested from system size_t size_; platform::Place place_; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index a2d198aba032308d2f5233e71b832f6515b54f6b..44cd915b168153343d9d1e3a7896cf6d2309b978 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -90,9 +90,9 @@ class CUDAGraphAllocator public: PrivateAllocation(CUDAGraphAllocator* allocator, AllocationPtr underlying_allocation) - : Allocation(underlying_allocation->ptr(), - underlying_allocation->size(), - underlying_allocation->place()), + : Allocation( + underlying_allocation->ptr(), underlying_allocation->base_ptr(), + underlying_allocation->size(), underlying_allocation->place()), allocator_(allocator->shared_from_this()), underlying_allocation_(std::move(underlying_allocation)) {} diff --git 
a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index d1fa6cce0164f6bc6a4fc330bc0d25bdd087fd5a..2334a1b6d4d55285f49a08938d8625b818dddcc8 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -73,7 +73,8 @@ class AutoGrowthBestFitAllocator : public Allocator { struct BlockAllocation : public Allocation { explicit BlockAllocation(const List<Block>::iterator &it) - : Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()), + : Allocation(it->ptr_, it->chunk_->allocation_->base_ptr(), it->size_, + it->chunk_->allocation_->place()), block_it_(it) {} List<Block>::iterator block_it_; diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/allocation/base_ptr_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..1b284c9899dbb021cb1fe98ccf3d8d15ff452646 --- /dev/null +++ b/paddle/fluid/memory/allocation/base_ptr_test.cu @@ -0,0 +1,118 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include <random> +#include "gtest/gtest.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CUDAAllocatoionBasePtrTest : public ::testing::Test { + public: + void SetUp() override { + place_ = platform::CUDAPlace(); + alloc_times_ = 100; + batch_size_ = 10; + max_alloc_size_ = platform::GpuMaxAllocSize() / alloc_times_; + random_engine_ = std::default_random_engine(time(NULL)); + dis_ = std::uniform_int_distribution<size_t>(0, max_alloc_size_); + } + + void OneByOneAllocTest() { + for (size_t i = 0; i < alloc_times_; ++i) { + size_t size = dis_(random_engine_); + std::shared_ptr<Allocation> allocation = AllocShared(place_, size); + + void* base_ptr = allocation->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + } + + Release(place_); + } + + void BatchByBatchAllocTest() { + std::vector<std::shared_ptr<Allocation>> allocations; + allocations.reserve(batch_size_); + size_t batch_num = alloc_times_ / batch_size_; + + for (size_t i = 0; i < batch_num; ++i) { + for (size_t j = 0; j < batch_size_; ++j) { + size_t size = dis_(random_engine_); + std::shared_ptr<Allocation> allocation = AllocShared(place_, size); + + void* base_ptr = allocation->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + + allocations.emplace_back(allocation); + } + allocations.clear(); + } + + Release(place_); + } + + void ContinuousAllocTest() { + std::vector<std::shared_ptr<Allocation>> allocations; + allocations.reserve(alloc_times_); + + for (size_t i = 0; i < alloc_times_; ++i) { + size_t size = dis_(random_engine_); + std::shared_ptr<Allocation> allocation = AllocShared(place_, size); + + void* base_ptr = allocation->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + + allocations.emplace_back(allocation); + } + + 
allocations.clear(); + Release(place_); + } + + void ZeroSizeAllocTest() { + std::shared_ptr<Allocation> allocation = AllocShared(place_, 0); + void* base_ptr = allocation->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + } + + private: + platform::CUDAPlace place_; + size_t max_alloc_size_; + size_t alloc_times_; + size_t batch_size_; + std::default_random_engine random_engine_; + std::uniform_int_distribution<size_t> dis_; +}; + +TEST_F(CUDAAllocatoionBasePtrTest, base_ptr_test) { + OneByOneAllocTest(); + BatchByBatchAllocTest(); + ContinuousAllocTest(); + ZeroSizeAllocTest(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 9e04fd3f0619e3c7a06fc4453ee3f2f0b28a786e..33cf2fe05424778b88eae135f582d3d39405e55a 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -42,7 +42,8 @@ namespace allocation { class CUDADeviceContextAllocation : public Allocation { public: explicit CUDADeviceContextAllocation(AllocationPtr allocation) - : Allocation(allocation->ptr(), allocation->size(), allocation->place()), + : Allocation(allocation->ptr(), allocation->base_ptr(), + allocation->size(), allocation->place()), underlying_allocation_(std::move(allocation)) {} ~CUDADeviceContextAllocation() { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 7b6b61d7a60ca8ab68388820811f7c684f65cc95..0d0318859c6262875fcea5b08ba60c7a65233b8f 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -20,8 +20,9 @@ namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
AllocationPtr underlying_allocation, gpuStream_t owning_stream) - : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), - underlying_allocation->place()), + : Allocation(underlying_allocation->ptr(), + underlying_allocation->base_ptr(), + underlying_allocation->size(), underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)), owning_stream_(std::move(owning_stream)) {} diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index e68277cc37b381a80e33860543c241f5e137c5be..e09d07a6e39634169fb99aca087ddaf025b18c40 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include <cstdlib> #include <mutex> +#include <set> #include <vector> #include "gflags/gflags.h" @@ -197,6 +198,11 @@ class RecordedGpuMallocHelper { if (result == gpuSuccess) { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + +#ifdef PADDLE_WITH_TESTING + gpu_ptrs.insert(*ptr); +#endif + return gpuSuccess; } else { RaiseNonOutOfMemoryError(&result); @@ -233,7 +239,22 @@ // cudaErrorCudartUnloading / // hipErrorDeinitialized } +#ifdef PADDLE_WITH_TESTING + gpu_ptrs.erase(ptr); +#endif + } + +#ifdef PADDLE_WITH_TESTING + void *GetBasePtr(void *ptr) { + auto it = gpu_ptrs.upper_bound(ptr); + + if (it == gpu_ptrs.begin()) { + return nullptr; + } + + return *(--it); } +#endif bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, size_t *actual_total) { @@ -301,7 +322,9 @@ class RecordedGpuMallocHelper { static std::once_flag once_flag_; static std::vector<std::unique_ptr<RecordedGpuMallocHelper>> instances_; -}; // NOLINT + + std::set<void *> gpu_ptrs; // just for testing +}; // NOLINT std::once_flag RecordedGpuMallocHelper::once_flag_; std::vector<std::unique_ptr<RecordedGpuMallocHelper>> RecordedGpuMallocHelper::instances_; @@ -352,5 +375,11 @@ void EmptyCache(void) { } } +#ifdef PADDLE_WITH_TESTING +void
*GetGpuBasePtr(void *ptr, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr); +} +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index 18e6ac83295f89108ea21555a35d2d7f66b3f3dd..9bc4d70bc457d009b771089f4b2acc4fcbc0ea07 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -145,6 +145,11 @@ bool IsGpuMallocRecorded(int dev_id); //! Empty idle cached memory held by the allocator. void EmptyCache(void); +//! Get the primitive pointer return from cudaMalloc, just for testing +#ifdef PADDLE_WITH_TESTING +void *GetGpuBasePtr(void *ptr, int dev_id); +#endif + } // namespace platform } // namespace paddle