From aa4d6a5d6caf4320fdb6f6eb1a7303713e39bc83 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Wed, 18 Dec 2019 21:30:48 -0600
Subject: [PATCH] Add some debug flags to auto growth allocator (#21766)

* add some debug flags to auto growth allocator, test=develop

* add comments about auto growth, test=develop
---
 paddle/fluid/memory/allocation/CMakeLists.txt |   1 +
 .../auto_growth_best_fit_allocator.cc         |  24 ++-
 .../auto_growth_best_fit_allocator_test.cc    | 168 ++++++++++++++++++
 .../pybind/global_value_getter_setter.cc      |   4 +
 python/paddle/fluid/__init__.py               |   2 +-
 5 files changed, 195 insertions(+), 4 deletions(-)
 create mode 100644 paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc

diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index ffae6e64808..dc3d9a1f56e 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -65,3 +65,4 @@ cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.c
 
 cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
 cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
+cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index 9ce4fd07829..f71a4b8e1a8 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -21,6 +21,18 @@
 #include <unordered_map>
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
 
+DEFINE_bool(free_idle_chunk, false,  // consulted by FreeImpl after each free
+            "Whether to free idle chunk when each allocation is freed. "
+            "If false, all freed allocation would be cached to speed up next "
+            "allocation request. If true, no allocation would be cached. This "
+            "flag only works when FLAGS_allocator_strategy=auto_growth.");
+
+DEFINE_bool(free_when_no_cache_hit, true,  // consulted by AllocateImpl
+            "Whether to free idle chunks when no cache hit. If true, idle "
+            "chunk would be freed when no cache hit; if false, idle "
+            "chunk would be freed when out of memory occurs. This flag "
+            "only works when FLAGS_allocator_strategy=auto_growth.");
+
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -57,14 +69,16 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
       block_it->is_free_ = false;
     }
   } else {
-    FreeIdleChunks();
+    if (FLAGS_free_when_no_cache_hit) {
+      FreeIdleChunks();
+    }
     size_t realloc_size = std::max(size, chunk_size_);
 
     try {
       chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size));
     } catch (BadAlloc &ex) {
-      if (size == realloc_size) throw ex;
-      realloc_size = size;
+      if (FLAGS_free_when_no_cache_hit) throw ex;
+      FreeIdleChunks();
       chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size));
     }
 
@@ -118,6 +132,10 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
                        block_it);
 
   delete allocation;
+
+  if (FLAGS_free_idle_chunk) {
+    FreeIdleChunks();
+  }
 }
 
 void AutoGrowthBestFitAllocator::FreeIdleChunks() {
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
new file mode 100644
index 00000000000..8865bf0b0db
--- /dev/null
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -0,0 +1,168 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
+#include <cstdlib>
+#include <vector>
+#include "gtest/gtest.h"
+
+DECLARE_bool(free_idle_chunk);
+DECLARE_bool(free_when_no_cache_hit);
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// Test allocator backed by malloc/free that records the bytes outstanding.
+class RecordedAllocator : public Allocator {
+ public:
+  size_t AllocatedSize() const { return allocated_size_; }
+
+ protected:
+  Allocation *AllocateImpl(size_t size) override {
+    allocated_size_ += size;
+    return new Allocation(malloc(size), size, platform::CPUPlace());
+  }
+  void FreeImpl(Allocation *allocation) override {
+    allocated_size_ -= allocation->size();
+    free(allocation->ptr());
+    delete allocation;
+  }
+
+ private:
+  size_t allocated_size_{0};  // bytes handed out and not yet freed
+};
+
+// Allocates and releases one buffer ten times under the given flag values,
+// asserting the bytes held by the underlying allocator after each step.
+static void TestFreeIdleChunk(bool free_idle_chunk,
+                              bool free_when_no_cache_hit) {
+  FLAGS_free_idle_chunk = free_idle_chunk;
+  FLAGS_free_when_no_cache_hit = free_when_no_cache_hit;
+  const size_t alignment = 4096;
+  const size_t memory_size = 8192;
+  auto recorded_allocator = std::make_shared<RecordedAllocator>();
+  auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
+      recorded_allocator, alignment);
+
+  for (size_t i = 0; i < 10; ++i) {
+    auto allocation = ag_allocator->Allocate(memory_size);
+    ASSERT_EQ(recorded_allocator->AllocatedSize(), memory_size + alignment);
+    allocation.reset();
+    // Idle chunks are released on free only when free_idle_chunk is set.
+    const size_t expected = free_idle_chunk ? 0UL : memory_size + alignment;
+    ASSERT_EQ(recorded_allocator->AllocatedSize(), expected);
+  }
+}
+
+// Test allocator: throws BadAlloc if a request would exceed the capacity.
+class LimitedResourceAllocator : public Allocator {
+ public:
+  explicit LimitedResourceAllocator(size_t capacity) : capacity_(capacity) {}
+
+  size_t AllocatedSize() const { return allocated_size_; }
+
+ protected:
+  Allocation *AllocateImpl(size_t size) override {
+    if (allocated_size_ + size > capacity_) {
+      throw BadAlloc("", __FILE__, __LINE__);
+    }
+    allocated_size_ += size;
+    return new Allocation(malloc(size), size, platform::CPUPlace());
+  }
+
+  void FreeImpl(Allocation *allocation) override {
+    allocated_size_ -= allocation->size();
+    free(allocation->ptr());
+    delete allocation;
+  }
+
+ private:
+  size_t allocated_size_{0};  // bytes handed out and not yet freed
+  const size_t capacity_;     // upper bound on allocated_size_
+};
+
+// Issues three allocate-then-free requests of growing size against a
+// capacity-limited allocator and checks the bytes the underlying allocator
+// still holds after each request, for the given FLAGS_free_when_no_cache_hit.
+static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) {
+  FLAGS_free_idle_chunk = false;
+  FLAGS_free_when_no_cache_hit = free_when_no_cache_hit;
+  const size_t alignment = 256;
+  const size_t base_memory_size = 4096;
+
+  /*
+   * Three requests are made in order:
+   *  - allocate x1, and then free x1
+   *  - allocate x2, and then free x2
+   *  - allocate x3, and then free x3
+   *
+   * where:
+   *  - x1 + alignment < x2
+   *  - x2 + alignment < x3
+   *  - x1 + x2 <= memory_capacity < x1 + x2 + x3
+   *
+   * memory_capacity is the midpoint of the two bounds above:
+   * ((x1 + x2) + (x1 + x2 + x3)) / 2 = x1 + x2 + x3 / 2.
+   *
+   * When FLAGS_free_when_no_cache_hit is true, the cached memory size at the
+   * end of each request is: x1 + alignment, x2 + alignment, x3 + alignment.
+   *
+   * When FLAGS_free_when_no_cache_hit is false, the cached memory size at the
+   * end of each request is:
+   * x1 + alignment, x1 + x2 + 2 * alignment, x3 + alignment.
+   */
+  const size_t x1 = base_memory_size;
+  const size_t x2 = base_memory_size + alignment * 2;
+  const size_t x3 = base_memory_size + alignment * 4;
+  const size_t memory_capacity = x1 + x2 + x3 / 2;
+
+  auto underlying_allocator =
+      std::make_shared<LimitedResourceAllocator>(memory_capacity);
+  auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
+      underlying_allocator, alignment);
+
+  // Every Allocate() result below is a discarded temporary, so it is handed
+  // back to ag_allocator as soon as its statement ends.
+  ag_allocator->Allocate(x1);
+  ASSERT_EQ(underlying_allocator->AllocatedSize(), x1 + alignment);
+
+  // With the flag off, the idle x1 chunk is kept next to the new x2 chunk.
+  ag_allocator->Allocate(x2);
+  const size_t held_after_x2 =
+      free_when_no_cache_hit ? x2 + alignment : x1 + x2 + 2 * alignment;
+  ASSERT_EQ(underlying_allocator->AllocatedSize(), held_after_x2);
+
+  // With the flag off, this request no longer fits beside the idle chunks,
+  // so they are freed after the first attempt throws BadAlloc.
+  ag_allocator->Allocate(x3);
+  ASSERT_EQ(underlying_allocator->AllocatedSize(), x3 + alignment);
+}
+
+TEST(test_auto_growth_allocator, test_free_idle_chunk) {
+  // Cover all four combinations of the two flags, in a fixed order.
+  for (const bool idle_flag : {false, true}) {
+    TestFreeIdleChunk(idle_flag, /*free_when_no_cache_hit=*/false);
+    TestFreeIdleChunk(idle_flag, /*free_when_no_cache_hit=*/true);
+  }
+}
+
+TEST(test_auto_growth_allocator, test_free_when_no_cache_hit) {
+  // Exercise both settings of FLAGS_free_when_no_cache_hit, false first.
+  for (const bool flag : {false, true}) TestFreeWhenNoCacheHit(flag);
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index d84e0d94a68..4a0e09bb2ae 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -30,6 +30,8 @@ DECLARE_double(eager_delete_tensor_gb);
 DECLARE_bool(use_mkldnn);
 DECLARE_bool(use_ngraph);
 DECLARE_bool(use_system_allocator);
+DECLARE_bool(free_idle_chunk);
+DECLARE_bool(free_when_no_cache_hit);
 
 namespace paddle {
 namespace pybind {
@@ -167,6 +169,8 @@ static void RegisterGlobalVarGetterSetter() {
   REGISTER_GLOBAL_VAR_GETTER_ONLY(FLAGS_use_ngraph);
   REGISTER_GLOBAL_VAR_GETTER_SETTER(FLAGS_eager_delete_tensor_gb);
   REGISTER_GLOBAL_VAR_GETTER_SETTER(FLAGS_use_system_allocator);
+  REGISTER_GLOBAL_VAR_GETTER_ONLY(FLAGS_free_idle_chunk);
+  REGISTER_GLOBAL_VAR_GETTER_ONLY(FLAGS_free_when_no_cache_hit);
 }
 
 }  // namespace pybind
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 71c47eb4e08..e6eb7f0c999 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -171,7 +171,7 @@ def __bootstrap__():
         'enable_parallel_graph', 'fuse_parameter_groups_size',
         'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size',
         'tracer_profile_fname', 'dygraph_debug', 'use_system_allocator',
-        'enable_unused_var_check'
+        'enable_unused_var_check', 'free_idle_chunk', 'free_when_no_cache_hit'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
-- 
GitLab