Unverified commit 431a2d6a authored by From00, committed by GitHub

Get base pointer from Allocation (#37978)

* Get GPU BasePtr from CUDA allocation

* Fix compile error for ROCm

* Add BasePtr function for IPUPlace in naive_best_fit_allocator.cc

* Add alignment for BuddyAllocator

* Set address alignment of BuddyAllocator to 32 bytes

* Fix CI error

* Remove code for naive_best_fit strategy
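For orientation before the diff: the sketch below shows the intended usage of the new `base_ptr()` API. It is a minimal sketch based on the test added in this commit, assuming a CUDA build with `FLAGS_allocator_strategy=auto_growth` as configured in the CMake hunk that follows.

```cpp
// Minimal usage sketch of the new base_ptr() API; assumes a CUDA build with
// FLAGS_allocator_strategy=auto_growth (see the CMake hunk below).
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"

void BasePtrSketch() {
  paddle::platform::CUDAPlace place;
  auto allocation = paddle::memory::AllocShared(place, /*size=*/1024);
  void* user_ptr = allocation->ptr();       // address handed to the caller
  void* base_ptr = allocation->base_ptr();  // start of the underlying chunk
  // In the test added by this commit, base_ptr is expected to equal
  // platform::GetGpuBasePtr(user_ptr, place.GetDeviceId()), i.e. the
  // pointer originally returned by cudaMalloc.
}
```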
Parent b0d12d99
......@@ -125,3 +125,10 @@ if(NOT WIN32)
cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)
endif(NOT WIN32)
if(WITH_GPU AND WITH_TESTING)
nv_test(base_ptr_test SRCS base_ptr_test.cu DEPS malloc gpu_info)
set_tests_properties(base_ptr_test PROPERTIES
ENVIRONMENT "FLAGS_allocator_strategy=auto_growth;
FLAGS_use_stream_safe_cuda_allocator=true;")
endif()
......@@ -26,6 +26,7 @@ class AlignedAllocation : public Allocation {
AlignedAllocation(AllocationPtr underlying_allocation, size_t offset)
: Allocation(
reinterpret_cast<uint8_t*>(underlying_allocation->ptr()) + offset,
underlying_allocation->base_ptr(),
underlying_allocation->size() - offset,
underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)) {}
......
......@@ -23,6 +23,8 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
......@@ -84,7 +86,10 @@ class Allocator;
class Allocation {
public:
inline Allocation(void* ptr, size_t size, platform::Place place)
: ptr_(ptr), size_(size), place_(place) {}
: ptr_(ptr), base_ptr_(ptr), size_(size), place_(place) {}
inline Allocation(void* ptr, void* base_ptr, size_t size,
platform::Place place)
: ptr_(ptr), base_ptr_(base_ptr), size_(size), place_(place) {}
Allocation(const Allocation& o) = delete;
Allocation& operator=(const Allocation& o) = delete;
......@@ -98,6 +103,15 @@ class Allocation {
// method like `defragmentation` to change `ptr_`.
inline void* ptr() const { return ptr_; }
inline void* base_ptr() const {
PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth",
paddle::platform::errors::Unimplemented(
"base_ptr() is only implemented for the auto_growth "
"strategy; the %s strategy is not supported",
FLAGS_allocator_strategy));
return base_ptr_;
}
// Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
// last valid element.
//
......@@ -126,6 +140,7 @@ class Allocation {
private:
void* ptr_;
void* base_ptr_; // the pointer directly requested from the system
size_t size_;
platform::Place place_;
......
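Each wrapper allocation in the hunks below forwards `base_ptr()` from its underlying allocation, so `ptr()` may be bumped forward for alignment while `base_ptr()` stays at the chunk start. A standalone sketch of that invariant follows; the names are hypothetical and this is illustrative only, not Paddle code.

```cpp
// Standalone sketch of the ptr()/base_ptr() invariant kept by the wrapper
// allocations in this commit; names are hypothetical, not Paddle code.
#include <cstddef>
#include <cstdint>

struct Alloc {
  void* ptr;       // possibly offset address returned to the caller
  void* base_ptr;  // address directly requested from the system
};

Alloc MakeAligned(Alloc underlying, std::size_t alignment) {
  auto addr = reinterpret_cast<std::uintptr_t>(underlying.ptr);
  std::size_t offset = (alignment - addr % alignment) % alignment;
  // ptr moves forward to the aligned address; base_ptr is forwarded as-is,
  // mirroring AlignedAllocation's constructor in the hunk above.
  return {static_cast<std::uint8_t*>(underlying.ptr) + offset,
          underlying.base_ptr};
}
```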
......@@ -90,9 +90,9 @@ class CUDAGraphAllocator
public:
PrivateAllocation(CUDAGraphAllocator* allocator,
AllocationPtr underlying_allocation)
: Allocation(underlying_allocation->ptr(),
underlying_allocation->size(),
underlying_allocation->place()),
: Allocation(
underlying_allocation->ptr(), underlying_allocation->base_ptr(),
underlying_allocation->size(), underlying_allocation->place()),
allocator_(allocator->shared_from_this()),
underlying_allocation_(std::move(underlying_allocation)) {}
......
......@@ -73,7 +73,8 @@ class AutoGrowthBestFitAllocator : public Allocator {
struct BlockAllocation : public Allocation {
explicit BlockAllocation(const List<Block>::iterator &it)
: Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()),
: Allocation(it->ptr_, it->chunk_->allocation_->base_ptr(), it->size_,
it->chunk_->allocation_->place()),
block_it_(it) {}
List<Block>::iterator block_it_;
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ctime>
#include <random>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
namespace paddle {
namespace memory {
namespace allocation {
class CUDAAllocationBasePtrTest : public ::testing::Test {
public:
void SetUp() override {
place_ = platform::CUDAPlace();
alloc_times_ = 100;
batch_size_ = 10;
max_alloc_size_ = platform::GpuMaxAllocSize() / alloc_times_;
random_engine_ = std::default_random_engine(time(NULL));
dis_ = std::uniform_int_distribution<int>(0, max_alloc_size_);
}
void OneByOneAllocTest() {
for (size_t i = 0; i < alloc_times_; ++i) {
size_t size = dis_(random_engine_);
std::shared_ptr<Allocation> allocation = AllocShared(place_, size);
void* base_ptr = allocation->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
}
Release(place_);
}
void BatchByBatchAllocTest() {
std::vector<std::shared_ptr<Allocation>> allocations;
allocations.reserve(batch_size_);
size_t batch_num = alloc_times_ / batch_size_;
for (size_t i = 0; i < batch_num; ++i) {
for (size_t j = 0; j < batch_size_; ++j) {
size_t size = dis_(random_engine_);
std::shared_ptr<Allocation> allocation = AllocShared(place_, size);
void* base_ptr = allocation->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
allocations.emplace_back(allocation);
}
allocations.clear();
}
Release(place_);
}
void ContinuousAllocTest() {
std::vector<std::shared_ptr<Allocation>> allocations;
allocations.reserve(alloc_times_);
for (size_t i = 0; i < alloc_times_; ++i) {
size_t size = dis_(random_engine_);
std::shared_ptr<Allocation> allocation = AllocShared(place_, size);
void* base_ptr = allocation->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
allocations.emplace_back(allocation);
}
allocations.clear();
Release(place_);
}
void ZeroSizeAllocTest() {
std::shared_ptr<Allocation> allocation = AllocShared(place_, 0);
void* base_ptr = allocation->base_ptr();
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
}
private:
platform::CUDAPlace place_;
size_t max_alloc_size_;
size_t alloc_times_;
size_t batch_size_;
std::default_random_engine random_engine_;
std::uniform_int_distribution<int> dis_;
};
TEST_F(CUDAAllocationBasePtrTest, base_ptr_test) {
OneByOneAllocTest();
BatchByBatchAllocTest();
ContinuousAllocTest();
ZeroSizeAllocTest();
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -42,7 +42,8 @@ namespace allocation {
class CUDADeviceContextAllocation : public Allocation {
public:
explicit CUDADeviceContextAllocation(AllocationPtr allocation)
: Allocation(allocation->ptr(), allocation->size(), allocation->place()),
: Allocation(allocation->ptr(), allocation->base_ptr(),
allocation->size(), allocation->place()),
underlying_allocation_(std::move(allocation)) {}
~CUDADeviceContextAllocation() {
......
......@@ -20,8 +20,9 @@ namespace allocation {
StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
AllocationPtr underlying_allocation, gpuStream_t owning_stream)
: Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
underlying_allocation->place()),
: Allocation(underlying_allocation->ptr(),
underlying_allocation->base_ptr(),
underlying_allocation->size(), underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)),
owning_stream_(std::move(owning_stream)) {}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <cstdlib>
#include <mutex>
#include <set>
#include <vector>
#include "gflags/gflags.h"
......@@ -197,6 +198,11 @@ class RecordedGpuMallocHelper {
if (result == gpuSuccess) {
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
#ifdef PADDLE_WITH_TESTING
gpu_ptrs.insert(*ptr);
#endif
return gpuSuccess;
} else {
RaiseNonOutOfMemoryError(&result);
......@@ -233,7 +239,22 @@ class RecordedGpuMallocHelper {
// cudaErrorCudartUnloading /
// hipErrorDeinitialized
}
#ifdef PADDLE_WITH_TESTING
gpu_ptrs.erase(ptr);
#endif
}
#ifdef PADDLE_WITH_TESTING
void *GetBasePtr(void *ptr) {
auto it = gpu_ptrs.upper_bound(ptr);
if (it == gpu_ptrs.begin()) {
return nullptr;
}
return *(--it);
}
#endif
bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total) {
......@@ -301,6 +322,8 @@ class RecordedGpuMallocHelper {
static std::once_flag once_flag_;
static std::vector<std::unique_ptr<RecordedGpuMallocHelper>> instances_;
std::set<void *> gpu_ptrs; // just for testing
}; // NOLINT
std::once_flag RecordedGpuMallocHelper::once_flag_;
......@@ -352,5 +375,11 @@ void EmptyCache(void) {
}
}
#ifdef PADDLE_WITH_TESTING
void *GetGpuBasePtr(void *ptr, int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr);
}
#endif
} // namespace platform
} // namespace paddle
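The `GetBasePtr` lookup above uses a standard ordered-set trick: record every base address handed out by the system allocator, then map any interior pointer to its chunk via `upper_bound` followed by one step back. Below is a self-contained host-side sketch of the same logic for illustration; the real helper is per-device state inside `RecordedGpuMallocHelper` and is only compiled under `PADDLE_WITH_TESTING`.

```cpp
// Self-contained sketch of the upper_bound lookup behind GetBasePtr; a
// host-side stand-in for illustration, not the CUDA-backed implementation.
#include <cassert>
#include <set>

static std::set<void*> gpu_ptrs;  // base addresses returned by the system

void* GetBasePtrSketch(void* ptr) {
  // First recorded base address strictly greater than ptr...
  auto it = gpu_ptrs.upper_bound(ptr);
  // ...so the element before it (if any) starts the chunk containing ptr.
  if (it == gpu_ptrs.begin()) return nullptr;
  return *(--it);
}

int main() {
  char chunk[64];
  gpu_ptrs.insert(chunk);  // stand-in for a pointer returned by cudaMalloc
  assert(GetBasePtrSketch(chunk + 10) == static_cast<void*>(chunk));
  return 0;
}
```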
......@@ -145,6 +145,11 @@ bool IsGpuMallocRecorded(int dev_id);
//! Empty idle cached memory held by the allocator.
void EmptyCache(void);
//! Get the raw pointer returned from cudaMalloc; just for testing
#ifdef PADDLE_WITH_TESTING
void *GetGpuBasePtr(void *ptr, int dev_id);
#endif
} // namespace platform
} // namespace paddle
......