From 0a73f7202af19420f10d70dc58ccc2c495c879a1 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Sun, 1 Sep 2019 00:10:52 +0800
Subject: [PATCH] Add retry_allocator for gpu (#19409)

* add retry_allocator for gpu, test=develop

* follow chengduoZH's comments, test=develop

* follow huihuang's comments,test=develop

* change f,l in enforce.h to be file,line, test=develop

* increase code coverage by adding unittests, test=develop

* fix CMakeLists.txt, test=develop
---
 paddle/fluid/memory/allocation/CMakeLists.txt |  5 +
 paddle/fluid/memory/allocation/allocator.h    | 13 +--
 .../memory/allocation/allocator_facade.cc     | 29 +++++-
 .../memory/allocation/best_fit_allocator.cc   |  4 +-
 .../fluid/memory/allocation/cuda_allocator.cc |  7 +-
 .../allocation/naive_best_fit_allocator.h     |  2 +
 .../memory/allocation/retry_allocator.cc      | 53 +++++++++--
 .../fluid/memory/allocation/retry_allocator.h |  6 +-
 .../memory/allocation/retry_allocator_test.cc | 51 +++++++++-
 paddle/fluid/memory/detail/buddy_allocator.cc | 79 ----------------
 paddle/fluid/memory/detail/buddy_allocator.h  |  9 --
 .../fluid/memory/detail/system_allocator.cc   | 27 +++---
 paddle/fluid/platform/enforce.h               | 94 ++++++++++---------
 13 files changed, 210 insertions(+), 169 deletions(-)
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 888c214ee9..76880106fa 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -46,7 +46,12 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
 
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
 if (WITH_TESTING)
+  if (WITH_GPU)
+    target_link_libraries(retry_allocator_test cuda_allocator)
+  endif()
+
   set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_property(TEST retry_allocator_test PROPERTY ENVIRONMENT GLOG_vmodule=retry_allocator=10)
 endif()
 
 cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 5d7c9bde92..379c8d0096 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -19,6 +19,7 @@
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/inlined_vector.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -26,14 +27,14 @@ namespace memory {
 namespace allocation {
 
 // Exception when `Alloc`/`AllocShared` failed
-class BadAlloc : public std::exception {
- public:
-  inline explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
+struct BadAlloc : public std::exception {
+  inline explicit BadAlloc(std::string err_msg, const char* file, int line)
+      : err_str_(platform::GetTraceBackString(std::move(err_msg), file, line)) {
+  }
 
-  inline const char* what() const noexcept override { return msg_.c_str(); }
+  const char* what() const noexcept override { return err_str_.c_str(); }
 
- private:
-  std::string msg_;
+  std::string err_str_;
 };
 
 class Allocator;
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 77b95f7160..cfc306dd35 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -37,7 +37,7 @@
 #endif
 
 DEFINE_int64(
-    gpu_allocator_retry_time, 0,
+    gpu_allocator_retry_time, 100,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
 
@@ -80,6 +80,12 @@ class AllocatorFacadePrivate {
       }
     }
     InitZeroSizeAllocators();
+
+    if (FLAGS_gpu_allocator_retry_time > 0) {
+      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
+    }
+
+    CheckAllocThreadSafe();
   }
 
   inline const std::shared_ptr<Allocator>& GetAllocator(
@@ -118,6 +124,8 @@ class AllocatorFacadePrivate {
    public:
     explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
 
+    bool IsAllocThreadSafe() const override { return true; }
+
    protected:
     Allocation* AllocateImpl(size_t size) override {
       return new Allocation(nullptr, 0, place_);
@@ -145,6 +153,25 @@ class AllocatorFacadePrivate {
     }
   }
 
+  void CheckAllocThreadSafe() const {
+    for (auto& pair : allocators_) {
+      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true);
+    }
+
+    for (auto& pair : zero_size_allocators_) {
+      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true);
+    }
+  }
+
+  void WrapCUDARetryAllocator(size_t retry_time) {
+    PADDLE_ENFORCE_GT(retry_time, 0, "Retry time must be larger than 0");
+    for (auto& pair : allocators_) {
+      if (platform::is_gpu_place(pair.first)) {
+        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
+      }
+    }
+  }
+
  private:
   std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
   std::map<platform::Place, std::shared_ptr<Allocator>> zero_size_allocators_;
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index 72ee4e5411..126464f049 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -150,8 +150,8 @@ Allocation* BestFitAllocator::AllocateImpl(size_t size) {
     }
   }
   if (UNLIKELY(highest_set_bit == free_chunks_.size())) {
-    throw BadAlloc(string::Sprintf(
-        "Cannot allocate %d, All fragments size is %d", size, FreeSize()));
+    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d, All fragments size is %d", size,
+                           FreeSize());
   }
   auto chunk_it = SplitChunk(size, highest_set_bit, map_it);
   return new BestFitAllocation(this, chunk_it);
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 349c71cece..10e275ab4f 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -17,6 +17,7 @@
 #include <cuda_runtime.h>
 #include <string>
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
@@ -36,9 +37,9 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) {
   void* ptr;
   auto status = cudaMalloc(&ptr, size);
   if (UNLIKELY(status != cudaSuccess)) {
-    throw BadAlloc(string::Sprintf(
-        "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
-        status, cudaGetErrorString(status)));
+    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s",
+                           size, place_.device, status,
+                           cudaGetErrorString(status));
   }
   return new Allocation(ptr, size, platform::Place(place_));
 }
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
index 5b376e6c20..913d583099 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
@@ -28,6 +28,8 @@ class NaiveBestFitAllocator : public Allocator {
  public:
   explicit NaiveBestFitAllocator(const platform::Place &p) : place_(p) {}
 
+  bool IsAllocThreadSafe() const override { return true; }
+
  protected:
   Allocation *AllocateImpl(size_t size) override;
   void FreeImpl(Allocation *allocation) override;
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index bf14ed5db1..ae6af53241 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -13,14 +13,40 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
+
 namespace paddle {
 namespace memory {
 namespace allocation {
 
+class WaitedAllocateSizeGuard {
+ public:
+  WaitedAllocateSizeGuard(std::atomic<size_t>* waited_size,
+                          size_t requested_size)
+      : waited_size_(waited_size), requested_size_(requested_size) {
+    waited_size_->fetch_add(requested_size_,
+                            std::memory_order::memory_order_relaxed);
+  }
+
+  ~WaitedAllocateSizeGuard() {
+    waited_size_->fetch_sub(requested_size_,
+                            std::memory_order::memory_order_relaxed);
+  }
+
+ private:
+  std::atomic<size_t>* waited_size_;
+  size_t requested_size_;
+};
+
 void RetryAllocator::FreeImpl(Allocation* allocation) {
   // Delete underlying allocation first.
+  size_t size = allocation->size();
   underlying_allocator_->Free(allocation);
-  cv_.notify_all();
+  if (UNLIKELY(waited_allocate_size_)) {
+    VLOG(10) << "Free " << size << " bytes and notify all waited threads, "
+                                   "where waited_allocate_size_ = "
+             << waited_allocate_size_;
+    cv_.notify_all();
+  }
 }
 
 Allocation* RetryAllocator::AllocateImpl(size_t size) {
@@ -31,29 +57,38 @@ Allocation* RetryAllocator::AllocateImpl(size_t size) {
   // But it would add lock even when allocation success at the first time
   try {
     return alloc_func();
-  } catch (BadAlloc& bad_alloc) {
+  } catch (BadAlloc&) {
     {
+      WaitedAllocateSizeGuard guard(&waited_allocate_size_, size);
+      VLOG(10) << "Allocation failed when allocating " << size
+               << " bytes, waited_allocate_size_ = " << waited_allocate_size_;
       // We can just write allocation retry inside the predicate function of
-      // wait_until
-      // But it needs to acquire the lock when executing predicate function
-      // For better performance, we use loop here
+      // wait_until. But it needs to acquire the lock when executing predicate
+      // function. For better performance, we use loop here
       auto end_time = std::chrono::high_resolution_clock::now() + retry_time_;
       auto wait_until = [&, this] {
         std::unique_lock<std::mutex> lock(mutex_);
         return cv_.wait_until(lock, end_time);
       };
+
+      size_t retry_time = 0;
       while (wait_until() != std::cv_status::timeout) {
         try {
           return alloc_func();
-        } catch (BadAlloc& ex) {
-          bad_alloc = ex;
+        } catch (BadAlloc&) {
+          // do nothing when it is not timeout
+          ++retry_time;
+          VLOG(10) << "Allocation failed when retrying " << retry_time
+                   << " times when allocating " << size
+                   << " bytes. Wait still.";
         } catch (...) {
           throw;
         }
       }
-
-      throw;  // rethrow the original exception or throw the internal bad_alloc
     }
+    VLOG(10) << "Allocation failed because of timeout when allocating " << size
+             << " bytes.";
+    return alloc_func();  // If timeout, try last allocation request.
   } catch (...) {
     throw;
   }
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 7840a83447..7c218e25c4 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -14,12 +14,14 @@
 
 #pragma once
 
+#include <atomic>              // NOLINT
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include <utility>
 #include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace memory {
@@ -48,9 +50,7 @@ class RetryAllocator : public Allocator {
   std::mutex mutex_;
   std::condition_variable cv_;
 
-  // For debug, We can add an atomic integer to record how many memory sizes are
-  // waited to allocate
-  // std::atomic<size_t> waited_allocate_size_{0};
+  std::atomic<size_t> waited_allocate_size_{0};
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc
index 4ac08d442d..8b24ff28b7 100644
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -17,12 +17,16 @@
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
 #include <mutex>               // NOLINT
-#include <thread>              // NOLINT
+#include <string>
+#include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#endif
 
 namespace paddle {
 namespace memory {
@@ -93,6 +97,51 @@ TEST(RetryAllocator, RetryAllocator) {
   }
 }
 
+class DummyAllocator : public Allocator {
+ public:
+  bool IsAllocThreadSafe() const override { return true; }
+
+ protected:
+  Allocation *AllocateImpl(size_t size) override {
+    PADDLE_THROW_BAD_ALLOC("Always BadAlloc");
+  }
+
+  void FreeImpl(Allocation *) override {}
+};
+
+TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
+  size_t retry_ms = 10;
+  {
+    RetryAllocator allocator(std::make_shared<DummyAllocator>(), retry_ms);
+    try {
+      auto allocation = allocator.Allocate(100);
+      ASSERT_TRUE(false);
+      allocation.reset();
+    } catch (BadAlloc &ex) {
+      ASSERT_TRUE(std::string(ex.what()).find("Always BadAlloc") !=
+                  std::string::npos);
+    }
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    platform::CUDAPlace p(0);
+    RetryAllocator allocator(std::make_shared<CUDAAllocator>(p), retry_ms);
+    size_t allocate_size = -1UL;  // Very large number
+    try {
+      auto allocation = allocator.Allocate(allocate_size);
+      ASSERT_TRUE(false);
+      allocation.reset();
+    } catch (BadAlloc &ex) {
+      ASSERT_TRUE(std::string(ex.what()).find(
+                      "Cannot allocate " + std::to_string(allocate_size) +
+                      " on GPU " + std::to_string(p.device)) !=
+                  std::string::npos);
+    }
+  }
+#endif
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 8fce86eeec..2519a9587a 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -164,15 +164,6 @@ void BuddyAllocator::Free(void* p) {
            << block->total_size(cache_) << ")";
   pool_.insert(
       IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
-
-  if (FLAGS_free_idle_memory) {
-    // Clean up if existing too much free memory
-    // Prefer freeing fallback allocation first
-    CleanIdleFallBackAlloc();
-
-    // Free normal allocation
-    CleanIdleNormalAlloc();
-  }
 }
 
 size_t BuddyAllocator::Used() { return total_used_; }
@@ -225,12 +216,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
                                      allocate_bytes, nullptr, nullptr);
 
-  // gpu fallback allocation
-  if (system_allocator_->UseGpu() &&
-      static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
-    fallback_alloc_count_++;
-  }
-
   total_free_ += allocate_bytes;
 
   // dump the block into pool
@@ -288,70 +273,6 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
   return block;
 }
 
-void BuddyAllocator::CleanIdleFallBackAlloc() {
-  // If fallback allocation does not exist, return directly
-  if (!fallback_alloc_count_) return;
-
-  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
-    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
-
-    // If no GPU fallback allocator, return
-    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
-      return;
-    }
-
-    VLOG(10) << "Return block " << block << " to fallback allocator.";
-
-    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
-    cache_.invalidate(block);
-
-    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-
-    total_free_ -= block->size(cache_);
-    fallback_alloc_count_--;
-
-    // If no fall allocation exists, return directly
-    if (!fallback_alloc_count_) return;
-  }
-}
-
-void BuddyAllocator::CleanIdleNormalAlloc() {
-  auto shall_free_alloc = [&]() -> bool {
-    // free all fallback allocations
-    if (fallback_alloc_count_ > 0) {
-      return true;
-    }
-    // keep 2x overhead if we haven't fallen back
-    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
-      return true;
-    }
-    return false;
-  };
-
-  if (!shall_free_alloc()) return;
-
-  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
-    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
-
-    VLOG(10) << "Return block " << block << " to base allocator.";
-
-    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
-    cache_.invalidate(block);
-
-    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-
-    total_free_ -= block->size(cache_);
-
-    if (!shall_free_alloc()) return;
-  }
-}
-
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index 2cf4bba822..791f8b5627 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -75,12 +75,6 @@ class BuddyAllocator {
   /*! \brief Find the existing chunk which used to allocation */
   PoolSet::iterator FindExistChunk(size_t size);
 
-  /*! \brief Clean idle fallback allocation */
-  void CleanIdleFallBackAlloc();
-
-  /*! \brief Clean idle normal allocation */
-  void CleanIdleNormalAlloc();
-
  private:
   size_t total_used_ = 0;  // the total size of used memory
   size_t total_free_ = 0;  // the total size of free memory
@@ -98,9 +92,6 @@ class BuddyAllocator {
    */
   PoolSet pool_;
 
-  /*! Record fallback allocation count for auto-scaling */
-  size_t fallback_alloc_count_ = 0;
-
  private:
   /*! Unify the metadata format between GPU and CPU allocations */
   MetadataCache cache_;
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 9721a33d2f..6d68aa165a 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <algorithm>  // for std::max
 
 #include "gflags/gflags.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
@@ -117,19 +118,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
-                 << " MB GPU memory. Please shrink "
-                    "FLAGS_fraction_of_gpu_memory_to_use or "
-                    "FLAGS_initial_gpu_memory_in_mb or "
-                    "FLAGS_reallocate_gpu_memory_in_mb"
-                    "environment variable to a lower value. "
-                 << "Current FLAGS_fraction_of_gpu_memory_to_use value is "
-                 << FLAGS_fraction_of_gpu_memory_to_use
-                 << ". Current FLAGS_initial_gpu_memory_in_mb value is "
-                 << FLAGS_initial_gpu_memory_in_mb
-                 << ". Current FLAGS_reallocate_gpu_memory_in_mb value is "
-                 << FLAGS_reallocate_gpu_memory_in_mb;
-    return nullptr;
+    PADDLE_THROW_BAD_ALLOC(
+        "Cannot malloc " + std::to_string(size / 1024.0 / 1024.0) +
+        " MB GPU memory. Please shrink "
+        "FLAGS_fraction_of_gpu_memory_to_use or "
+        "FLAGS_initial_gpu_memory_in_mb or "
+        "FLAGS_reallocate_gpu_memory_in_mb"
+        "environment variable to a lower value. " +
+        "Current FLAGS_fraction_of_gpu_memory_to_use value is " +
+        std::to_string(FLAGS_fraction_of_gpu_memory_to_use) +
+        ". Current FLAGS_initial_gpu_memory_in_mb value is " +
+        std::to_string(FLAGS_initial_gpu_memory_in_mb) +
+        ". Current FLAGS_reallocate_gpu_memory_in_mb value is " +
+        std::to_string(FLAGS_reallocate_gpu_memory_in_mb));
   }
 }
 
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 862ea8979a..1e0cf0a405 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -66,25 +66,61 @@ inline std::string demangle(std::string name) {
 inline std::string demangle(std::string name) { return name; }
 #endif
 
+template <typename StrType>
+inline std::string GetTraceBackString(StrType&& what, const char* file,
+                                      int line) {
+  static constexpr int TRACE_STACK_LIMIT = 100;
+  std::ostringstream sout;
+
+  sout << string::Sprintf("%s at [%s:%d]", std::forward<StrType>(what), file,
+                          line)
+       << std::endl;
+  sout << "PaddlePaddle Call Stacks: " << std::endl;
+#if !defined(_WIN32)
+  void* call_stack[TRACE_STACK_LIMIT];
+  auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
+  auto symbols = backtrace_symbols(call_stack, size);
+  Dl_info info;
+  for (int i = 0; i < size; ++i) {
+    if (dladdr(call_stack[i], &info) && info.dli_sname) {
+      auto demangled = demangle(info.dli_sname);
+      auto addr_offset = static_cast<char*>(call_stack[i]) -
+                         static_cast<char*>(info.dli_saddr);
+      sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, 2 + sizeof(void*) * 2,
+                              call_stack[i], demangled, addr_offset);
+    } else {
+      sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
+                              call_stack[i]);
+    }
+  }
+  free(symbols);
+#else
+  sout << "Windows not support stack backtrace yet.";
+#endif
+  return sout.str();
+}
+
 struct EnforceNotMet : public std::exception {
   std::string err_str_;
-  EnforceNotMet(std::exception_ptr e, const char* f, int l) {
+  EnforceNotMet(std::exception_ptr e, const char* file, int line) {
     try {
       std::rethrow_exception(e);
     } catch (std::exception& e) {
-      Init(e.what(), f, l);
+      err_str_ = GetTraceBackString(e.what(), file, line);
+      SaveErrorInformation(err_str_);
     }
   }
 
-  EnforceNotMet(const std::string& str, const char* f, int l) {
-    Init(str, f, l);
+  EnforceNotMet(const std::string& str, const char* file, int line)
+      : err_str_(GetTraceBackString(str, file, line)) {
+    SaveErrorInformation(err_str_);
   }
 
   const char* what() const noexcept override { return err_str_.c_str(); }
 
  private:
-  const std::string output_file_name{"paddle_err_info"};
-  void saveErrorInformation(const std::string& err) {
+  static void SaveErrorInformation(const std::string& err) {
+    const std::string output_file_name{"paddle_err_info"};
     std::stringstream ss;
     ss << output_file_name;
     std::time_t t = std::time(nullptr);
@@ -98,49 +134,15 @@ struct EnforceNotMet : public std::exception {
       err_file.close();
     }
   }
-
-  template <typename StrType>
-  inline void Init(StrType what, const char* f, int l) {
-    static constexpr int TRACE_STACK_LIMIT = 100;
-    std::ostringstream sout;
-
-    sout << string::Sprintf("%s at [%s:%d]", what, f, l) << std::endl;
-    sout << "PaddlePaddle Call Stacks: " << std::endl;
-#if !defined(_WIN32)
-    void* call_stack[TRACE_STACK_LIMIT];
-    auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
-    auto symbols = backtrace_symbols(call_stack, size);
-    Dl_info info;
-    for (int i = 0; i < size; ++i) {
-      if (dladdr(call_stack[i], &info) && info.dli_sname) {
-        auto demangled = demangle(info.dli_sname);
-        auto addr_offset = static_cast<char*>(call_stack[i]) -
-                           static_cast<char*>(info.dli_saddr);
-        sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
-                                2 + sizeof(void*) * 2, call_stack[i], demangled,
-                                addr_offset);
-      } else {
-        sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
-                                call_stack[i]);
-      }
-    }
-    free(symbols);
-#else
-    sout << "Windows not support stack backtrace yet.";
-#endif
-    err_str_ = sout.str();
-
-    saveErrorInformation(err_str_);
-  }
 };
 
 struct EOFException : public std::exception {
   std::string err_str_;
-  EOFException(const char* err_msg, const char* f, int l) {
-    err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, f, l);
+  EOFException(const char* err_msg, const char* file, int line) {
+    err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, file, line);
   }
 
-  const char* what() const noexcept { return err_str_.c_str(); }
+  const char* what() const noexcept override { return err_str_.c_str(); }
 };
 
 // Because most enforce conditions would evaluate to true, we can use
@@ -329,6 +331,12 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
                                            __LINE__);                          \
   } while (0)
 
+#define PADDLE_THROW_BAD_ALLOC(...)                                  \
+  do {                                                               \
+    throw ::paddle::memory::allocation::BadAlloc(                    \
+        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
+  } while (0)
+
 /*
  * Some enforce helpers here, usage:
  *    int a = 1;
-- 
GitLab