refine auto_growth allocator (#35732)

* do not use AlignedAllocator when CUDA allocations are already aligned
* update test
* fix error when running in multiple processes

refine auto_growth allocator (#35732)
* do not use AlignedAllocator when CUDA allocations are already aligned
* update test
* fix error when running in multiple processes
6d353aa5 · Leo Chen · GitHub · e5b4dd73 · 6d353aa5 · 6d353aa5
4 changed files
--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
+// For memory address alignment
 class AlignedAllocation : public Allocation {
 public:
  AlignedAllocation(AllocationPtr underlying_allocation, size_t offset)

--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -23,6 +23,7 @@
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
 #endif
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
@@ -201,6 +202,8 @@ class AllocatorFacadePrivate {
  inline const std::shared_ptr<Allocator>& GetAllocator(
      const platform::Place& place, size_t size) {
+    VLOG(4) << "GetAllocator"
+            << " " << place << " " << size;
    const auto& allocators =
        (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                          : GetAllocatorMap())
@@ -256,8 +259,39 @@ class AllocatorFacadePrivate {
  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                   bool allow_free_idle_chunk) {
+    // Installs the AutoGrowthBestFitAllocator for place `p` into allocators_.
+    // The raw CUDAAllocator is wrapped in an AlignedAllocator only when the
+    // device's textureAlignment is smaller than the required alignment, or
+    // when the device properties cannot be queried.
    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto alignment = platform::GpuMinChunkSize();
+    bool need_addr_align = true;
+    // NOTE: the CUDA runtime cannot be forked, so calling any CUDA API in a
+    // forked process may fail with cuda error(3), i.e.
+    // cudaErrorInitializationError. In that case the CUDAAllocator is only
+    // initialized but not really used.
+    // The try-catch block below handles the case where
+    // GetDeviceProperties() fails in such a multi-process setup (for example,
+    // in a dataloader with num_worker > 0) by falling back to the aligned path.
+    try {
+      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
+      need_addr_align = prop.textureAlignment < alignment;
+      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
+              << prop.textureAlignment
+              << ", set need_addr_align=" << need_addr_align;
+    } catch (...) {
+      need_addr_align = true;
+      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
+    }
+    // When textureAlignment >= alignment, the address returned by the CUDA
+    // allocator is treated as aligned already,
+    // ref:
+    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
+    std::shared_ptr<Allocator> underlying_allocator{nullptr};
+    if (need_addr_align) {
+      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
+      // Wrap the real CUDA allocator. Wrapping `underlying_allocator` here
+      // would wrap the nullptr it was just initialized with and leave
+      // cuda_allocator unused.
+      underlying_allocator =
+          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
+    } else {
+      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
+      underlying_allocator = cuda_allocator;
+    }
+    // chunk_size is passed as 0; the AutoGrowthBestFitAllocator constructor
+    // raises its chunk size to at least `alignment`.
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+        underlying_allocator, alignment, 0, allow_free_idle_chunk);
  }
 #endif

--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -40,14 +40,14 @@ namespace allocation {
 AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
    const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
    size_t chunk_size, bool allow_free_idle_chunk)
+    // underlying_allocator is stored as-is: it is no longer wrapped in an
+    // AlignedAllocator here, so a caller that needs aligned addresses must
+    // pass in an allocator that already provides them.
-    : underlying_allocator_(
+    : underlying_allocator_(underlying_allocator),
-          std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
      alignment_(alignment),
+      // chunk_size_ is never smaller than alignment, even if chunk_size is 0.
      chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
      allow_free_idle_chunk_(allow_free_idle_chunk) {}
-Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
+Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) {
-  size = AlignedSize(size, alignment_);
+  size_t size = AlignedSize(unaligned_size, alignment_);
+  VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size;
  std::lock_guard<SpinLock> guard(spinlock_);
  auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
@@ -57,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
    free_blocks_.erase(iter);
    auto *chunk = block_it->chunk_;
    size_t remaining_size = block_it->size_ - size;
+    VLOG(10) << "Allocate " << size << " bytes from chunk size "
+             << block_it->size_ << ", remaining " << remaining_size;
    if (remaining_size == 0) {
      block_it->is_free_ = false;
    } else {
@@ -95,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
    }
    blocks.emplace_back(p + remaining_size, size, false, chunk);
    block_it = --(blocks.end());
-    VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining "
+    VLOG(2) << "Not found and reallocate " << realloc_size << "("
-            << remaining_size;
+            << static_cast<void *>(p) << "), and remaining " << remaining_size;
  }
  return new BlockAllocation(block_it);
 }
 void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
+  VLOG(10) << "Free " << allocation->size() << " bytes";
  std::lock_guard<SpinLock> guard(spinlock_);
  auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
  auto &blocks = block_it->chunk_->blocks_;

--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include <cstdlib>
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include "gtest/gtest.h"
 DECLARE_bool(free_idle_chunk);
@@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk,
  FLAGS_free_idle_chunk = free_idle_chunk;
  FLAGS_free_when_no_cache_hit = free_when_no_cache_hit;
  auto recorded_allocator = std::make_shared<RecordedAllocator>();
  size_t alignment = 4096;
  size_t memory_size = 8192;
+  auto underlying_allocator =
+      std::make_shared<AlignedAllocator>(recorded_allocator, alignment);
  auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-      recorded_allocator, alignment);
+      underlying_allocator, alignment);
  for (size_t i = 0; i < 10; ++i) {
    auto allocation = ag_allocator->Allocate(memory_size);
@@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) {
  auto underlying_allocator =
      std::make_shared<LimitedResourceAllocator>(memory_capacity);
+  auto aligned_allocator =
+      std::make_shared<AlignedAllocator>(underlying_allocator, alignment);
  auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-      underlying_allocator, alignment);
+      aligned_allocator, alignment);
  ag_allocator->Allocate(allocate_size[0]);
  ASSERT_EQ(underlying_allocator->AllocatedSize(),