Unverified commit 0a73f720, authored by Zeng Jinle, committed by GitHub

Add retry_allocator for gpu (#19409)

* add retry_allocator for gpu, test=develop

* follow chengduoZH's comments, test=develop

* follow huihuang's comments, test=develop

* change f,l in enforce.h to be file,line, test=develop

* increase code coverage by adding unittests, test=develop

* fix CMakeLists.txt, test=develop
Parent commit: c756b5d2
@@ -46,7 +46,12 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
 if (WITH_TESTING)
+  if (WITH_GPU)
+    target_link_libraries(retry_allocator_test cuda_allocator)
+  endif()
+
   set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_property(TEST retry_allocator_test PROPERTY ENVIRONMENT GLOG_vmodule=retry_allocator=10)
 endif()
 cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
...
@@ -19,6 +19,7 @@
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/inlined_vector.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"

 namespace paddle {
@@ -26,14 +27,14 @@ namespace memory {
 namespace allocation {

 // Exception when `Alloc`/`AllocShared` failed
-class BadAlloc : public std::exception {
- public:
-  inline explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
-
-  inline const char* what() const noexcept override { return msg_.c_str(); }
-
- private:
-  std::string msg_;
+struct BadAlloc : public std::exception {
+  inline explicit BadAlloc(std::string err_msg, const char* file, int line)
+      : err_str_(platform::GetTraceBackString(std::move(err_msg), file, line)) {
+  }
+
+  const char* what() const noexcept override { return err_str_.c_str(); }
+
+  std::string err_str_;
 };

 class Allocator;
...
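The behavioral change above is that BadAlloc now receives the throw site and builds its complete what() string at construction time (via platform::GetTraceBackString, added in the enforce.h hunk near the end of this diff). As a standalone illustration only, here is a minimal sketch of the same pattern with hypothetical names (TracedBadAlloc, MakeLocatedMessage) and without the backtrace; it is not the Paddle API:

#include <exception>
#include <sstream>
#include <string>
#include <utility>

// Hypothetical helper: prepend "<msg> at [<file>:<line>]", roughly what
// platform::GetTraceBackString does before appending a call stack.
inline std::string MakeLocatedMessage(std::string msg, const char* file,
                                      int line) {
  std::ostringstream out;
  out << msg << " at [" << file << ":" << line << "]";
  return out.str();
}

// Sketch of the new-style exception: the full error string is assembled once,
// in the constructor, so what() is just a cheap accessor.
struct TracedBadAlloc : public std::exception {
  TracedBadAlloc(std::string msg, const char* file, int line)
      : err_str_(MakeLocatedMessage(std::move(msg), file, line)) {}

  const char* what() const noexcept override { return err_str_.c_str(); }

  std::string err_str_;
};

A throw site passes __FILE__ and __LINE__ explicitly, which is what the PADDLE_THROW_BAD_ALLOC macro introduced in enforce.h automates.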
@@ -37,7 +37,7 @@
 #endif

 DEFINE_int64(
-    gpu_allocator_retry_time, 0,
+    gpu_allocator_retry_time, 100,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
@@ -80,6 +80,12 @@ class AllocatorFacadePrivate {
       }
     }
     InitZeroSizeAllocators();
+
+    if (FLAGS_gpu_allocator_retry_time > 0) {
+      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
+    }
+
+    CheckAllocThreadSafe();
   }

   inline const std::shared_ptr<Allocator>& GetAllocator(
@@ -118,6 +124,8 @@ class AllocatorFacadePrivate {
  public:
   explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}

+  bool IsAllocThreadSafe() const override { return true; }
+
  protected:
   Allocation* AllocateImpl(size_t size) override {
     return new Allocation(nullptr, 0, place_);
@@ -145,6 +153,25 @@ class AllocatorFacadePrivate {
     }
   }

+  void CheckAllocThreadSafe() const {
+    for (auto& pair : allocators_) {
+      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true);
+    }
+
+    for (auto& pair : zero_size_allocators_) {
+      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true);
+    }
+  }
+
+  void WrapCUDARetryAllocator(size_t retry_time) {
+    PADDLE_ENFORCE_GT(retry_time, 0, "Retry time must be larger than 0");
+    for (auto& pair : allocators_) {
+      if (platform::is_gpu_place(pair.first)) {
+        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
+      }
+    }
+  }
+
  private:
   std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
   std::map<platform::Place, std::shared_ptr<Allocator>> zero_size_allocators_;
...
@@ -150,8 +150,8 @@ Allocation* BestFitAllocator::AllocateImpl(size_t size) {
     }
   }
   if (UNLIKELY(highest_set_bit == free_chunks_.size())) {
-    throw BadAlloc(string::Sprintf(
-        "Cannot allocate %d, All fragments size is %d", size, FreeSize()));
+    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d, All fragments size is %d", size,
+                           FreeSize());
   }
   auto chunk_it = SplitChunk(size, highest_set_bit, map_it);
   return new BestFitAllocation(this, chunk_it);
...
@@ -17,6 +17,7 @@
 #include <cuda_runtime.h>
 #include <string>
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"

 namespace paddle {
@@ -36,9 +37,9 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) {
   void* ptr;
   auto status = cudaMalloc(&ptr, size);
   if (UNLIKELY(status != cudaSuccess)) {
-    throw BadAlloc(string::Sprintf(
-        "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
-        status, cudaGetErrorString(status)));
+    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s",
+                           size, place_.device, status,
+                           cudaGetErrorString(status));
   }
   return new Allocation(ptr, size, platform::Place(place_));
 }
...
@@ -28,6 +28,8 @@ class NaiveBestFitAllocator : public Allocator {
  public:
   explicit NaiveBestFitAllocator(const platform::Place &p) : place_(p) {}

+  bool IsAllocThreadSafe() const override { return true; }
+
  protected:
   Allocation *AllocateImpl(size_t size) override;
   void FreeImpl(Allocation *allocation) override;
...
@@ -13,14 +13,40 @@
 // limitations under the License.

 #include "paddle/fluid/memory/allocation/retry_allocator.h"

 namespace paddle {
 namespace memory {
 namespace allocation {

+class WaitedAllocateSizeGuard {
+ public:
+  WaitedAllocateSizeGuard(std::atomic<size_t>* waited_size,
+                          size_t requested_size)
+      : waited_size_(waited_size), requested_size_(requested_size) {
+    waited_size_->fetch_add(requested_size_,
+                            std::memory_order::memory_order_relaxed);
+  }
+
+  ~WaitedAllocateSizeGuard() {
+    waited_size_->fetch_sub(requested_size_,
+                            std::memory_order::memory_order_relaxed);
+  }
+
+ private:
+  std::atomic<size_t>* waited_size_;
+  size_t requested_size_;
+};
+
 void RetryAllocator::FreeImpl(Allocation* allocation) {
   // Delete underlying allocation first.
+  size_t size = allocation->size();
   underlying_allocator_->Free(allocation);
-  cv_.notify_all();
+  if (UNLIKELY(waited_allocate_size_)) {
+    VLOG(10) << "Free " << size << " bytes and notify all waited threads, "
+                "where waited_allocate_size_ = "
+             << waited_allocate_size_;
+    cv_.notify_all();
+  }
 }

 Allocation* RetryAllocator::AllocateImpl(size_t size) {
@@ -31,29 +57,38 @@ Allocation* RetryAllocator::AllocateImpl(size_t size) {
   // But it would add lock even when allocation success at the first time
   try {
     return alloc_func();
-  } catch (BadAlloc& bad_alloc) {
+  } catch (BadAlloc&) {
     {
+      WaitedAllocateSizeGuard guard(&waited_allocate_size_, size);
+      VLOG(10) << "Allocation failed when allocating " << size
+               << " bytes, waited_allocate_size_ = " << waited_allocate_size_;
       // We can just write allocation retry inside the predicate function of
-      // wait_until
-      // But it needs to acquire the lock when executing predicate function
-      // For better performance, we use loop here
+      // wait_until. But it needs to acquire the lock when executing predicate
+      // function. For better performance, we use loop here
       auto end_time = std::chrono::high_resolution_clock::now() + retry_time_;
       auto wait_until = [&, this] {
         std::unique_lock<std::mutex> lock(mutex_);
         return cv_.wait_until(lock, end_time);
       };
+
+      size_t retry_time = 0;
       while (wait_until() != std::cv_status::timeout) {
         try {
           return alloc_func();
-        } catch (BadAlloc& ex) {
-          bad_alloc = ex;
+        } catch (BadAlloc&) {
+          // do nothing when it is not timeout
+          ++retry_time;
+          VLOG(10) << "Allocation failed when retrying " << retry_time
+                   << " times when allocating " << size
+                   << " bytes. Wait still.";
         } catch (...) {
           throw;
         }
       }
-      throw;  // rethrow the original exception or throw the internal bad_alloc
     }
+    VLOG(10) << "Allocation failed because of timeout when allocating " << size
+             << " bytes.";
+    return alloc_func();  // If timeout, try last allocation request.
   } catch (...) {
     throw;
   }
...
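To summarize the retry protocol implemented above: a failed allocation records how many bytes it is waiting for, waits on cv_ until either the retry deadline passes or a Free notifies it, retries after each wake-up, and only after timing out makes one final attempt whose BadAlloc is allowed to propagate. The following is a standalone sketch of that wait/notify pattern, with hypothetical names (RetrySketch, NotifyFreed) and std::bad_alloc standing in for BadAlloc; it is illustrative, not the Paddle implementation:

#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <functional>
#include <mutex>
#include <new>  // std::bad_alloc
#include <utility>

class RetrySketch {
 public:
  RetrySketch(std::function<void*(std::size_t)> underlying_alloc,
              std::size_t retry_ms)
      : alloc_(std::move(underlying_alloc)),
        retry_time_(std::chrono::milliseconds(retry_ms)) {}

  void* Alloc(std::size_t size) {
    try {
      return alloc_(size);  // fast path: no lock is taken on success
    } catch (std::bad_alloc&) {
      auto end_time = std::chrono::steady_clock::now() + retry_time_;
      auto wait_once = [&] {  // block until a free notification or the deadline
        std::unique_lock<std::mutex> lock(mutex_);
        return cv_.wait_until(lock, end_time);
      };
      while (wait_once() != std::cv_status::timeout) {
        try {
          return alloc_(size);  // retry after each wake-up
        } catch (std::bad_alloc&) {
          // not timed out yet; keep waiting for more memory to be freed
        }
      }
      return alloc_(size);  // deadline reached: last attempt, may rethrow
    }
  }

  void NotifyFreed() {  // call after the underlying allocator frees memory
    cv_.notify_all();
  }

 private:
  std::function<void*(std::size_t)> alloc_;
  std::chrono::milliseconds retry_time_;
  std::mutex mutex_;
  std::condition_variable cv_;
};

The real RetryAllocator additionally tracks waited_allocate_size_ so that FreeImpl only pays the notify_all() cost when some thread is actually waiting.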
@@ -14,12 +14,14 @@
 #pragma once

+#include <atomic>              // NOLINT
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include <utility>

 #include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/enforce.h"

 namespace paddle {
 namespace memory {
@@ -48,9 +50,7 @@ class RetryAllocator : public Allocator {
   std::mutex mutex_;
   std::condition_variable cv_;

-  // For debug, We can add an atomic integer to record how many memory sizes are
-  // waited to allocate
-  // std::atomic<size_t> waited_allocate_size_{0};
+  std::atomic<size_t> waited_allocate_size_{0};
 };

 }  // namespace allocation
...
@@ -17,12 +17,16 @@
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
 #include <mutex>               // NOLINT
+#include <string>
 #include <thread>              // NOLINT
 #include <vector>

 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#endif

 namespace paddle {
 namespace memory {
@@ -93,6 +97,51 @@ TEST(RetryAllocator, RetryAllocator) {
   }
 }

+class DummyAllocator : public Allocator {
+ public:
+  bool IsAllocThreadSafe() const override { return true; }
+
+ protected:
+  Allocation *AllocateImpl(size_t size) override {
+    PADDLE_THROW_BAD_ALLOC("Always BadAlloc");
+  }
+
+  void FreeImpl(Allocation *) override {}
+};
+
+TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
+  size_t retry_ms = 10;
+  {
+    RetryAllocator allocator(std::make_shared<DummyAllocator>(), retry_ms);
+    try {
+      auto allocation = allocator.Allocate(100);
+      ASSERT_TRUE(false);
+      allocation.reset();
+    } catch (BadAlloc &ex) {
+      ASSERT_TRUE(std::string(ex.what()).find("Always BadAlloc") !=
+                  std::string::npos);
+    }
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    platform::CUDAPlace p(0);
+    RetryAllocator allocator(std::make_shared<CUDAAllocator>(p), retry_ms);
+    size_t allocate_size = -1UL;  // Very large number
+    try {
+      auto allocation = allocator.Allocate(allocate_size);
+      ASSERT_TRUE(false);
+      allocation.reset();
+    } catch (BadAlloc &ex) {
+      ASSERT_TRUE(std::string(ex.what()).find(
+                      "Cannot allocate " + std::to_string(allocate_size) +
+                      " on GPU " + std::to_string(p.device)) !=
+                  std::string::npos);
+    }
+  }
+#endif
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
@@ -164,15 +164,6 @@ void BuddyAllocator::Free(void* p) {
           << block->total_size(cache_) << ")";
   pool_.insert(
       IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
-
-  if (FLAGS_free_idle_memory) {
-    // Clean up if existing too much free memory
-
-    // Prefer freeing fallback allocation first
-    CleanIdleFallBackAlloc();
-
-    // Free normal allocation
-    CleanIdleNormalAlloc();
-  }
 }

 size_t BuddyAllocator::Used() { return total_used_; }
@@ -225,12 +216,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
                                      allocate_bytes, nullptr, nullptr);

-  // gpu fallback allocation
-  if (system_allocator_->UseGpu() &&
-      static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
-    fallback_alloc_count_++;
-  }
-
   total_free_ += allocate_bytes;

   // dump the block into pool
@@ -288,70 +273,6 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
   return block;
 }

-void BuddyAllocator::CleanIdleFallBackAlloc() {
-  // If fallback allocation does not exist, return directly
-  if (!fallback_alloc_count_) return;
-
-  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
-    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
-
-    // If no GPU fallback allocator, return
-    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
-      return;
-    }
-
-    VLOG(10) << "Return block " << block << " to fallback allocator.";
-
-    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
-    cache_.invalidate(block);
-
-    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-
-    total_free_ -= block->size(cache_);
-    fallback_alloc_count_--;
-
-    // If no fall allocation exists, return directly
-    if (!fallback_alloc_count_) return;
-  }
-}
-
-void BuddyAllocator::CleanIdleNormalAlloc() {
-  auto shall_free_alloc = [&]() -> bool {
-    // free all fallback allocations
-    if (fallback_alloc_count_ > 0) {
-      return true;
-    }
-    // keep 2x overhead if we haven't fallen back
-    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
-      return true;
-    }
-    return false;
-  };
-
-  if (!shall_free_alloc()) return;
-
-  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
-    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
-
-    VLOG(10) << "Return block " << block << " to base allocator.";
-
-    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
-    cache_.invalidate(block);
-
-    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-
-    total_free_ -= block->size(cache_);
-
-    if (!shall_free_alloc()) return;
-  }
-}
-
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
@@ -75,12 +75,6 @@ class BuddyAllocator {
   /*! \brief Find the existing chunk which used to allocation */
   PoolSet::iterator FindExistChunk(size_t size);

-  /*! \brief Clean idle fallback allocation */
-  void CleanIdleFallBackAlloc();
-
-  /*! \brief Clean idle normal allocation */
-  void CleanIdleNormalAlloc();
-
  private:
   size_t total_used_ = 0;  // the total size of used memory
   size_t total_free_ = 0;  // the total size of free memory
@@ -98,9 +92,6 @@ class BuddyAllocator {
    */
   PoolSet pool_;

-  /*! Record fallback allocation count for auto-scaling */
-  size_t fallback_alloc_count_ = 0;
-
  private:
   /*! Unify the metadata format between GPU and CPU allocations */
   MetadataCache cache_;
...
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <algorithm>  // for std::max

 #include "gflags/gflags.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
@@ -117,19 +118,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
-                 << " MB GPU memory. Please shrink "
-                    "FLAGS_fraction_of_gpu_memory_to_use or "
-                    "FLAGS_initial_gpu_memory_in_mb or "
-                    "FLAGS_reallocate_gpu_memory_in_mb"
-                    "environment variable to a lower value. "
-                 << "Current FLAGS_fraction_of_gpu_memory_to_use value is "
-                 << FLAGS_fraction_of_gpu_memory_to_use
-                 << ". Current FLAGS_initial_gpu_memory_in_mb value is "
-                 << FLAGS_initial_gpu_memory_in_mb
-                 << ". Current FLAGS_reallocate_gpu_memory_in_mb value is "
-                 << FLAGS_reallocate_gpu_memory_in_mb;
-    return nullptr;
+    PADDLE_THROW_BAD_ALLOC(
+        "Cannot malloc " + std::to_string(size / 1024.0 / 1024.0) +
+        " MB GPU memory. Please shrink "
+        "FLAGS_fraction_of_gpu_memory_to_use or "
+        "FLAGS_initial_gpu_memory_in_mb or "
+        "FLAGS_reallocate_gpu_memory_in_mb"
+        "environment variable to a lower value. " +
+        "Current FLAGS_fraction_of_gpu_memory_to_use value is " +
+        std::to_string(FLAGS_fraction_of_gpu_memory_to_use) +
+        ". Current FLAGS_initial_gpu_memory_in_mb value is " +
+        std::to_string(FLAGS_initial_gpu_memory_in_mb) +
+        ". Current FLAGS_reallocate_gpu_memory_in_mb value is " +
+        std::to_string(FLAGS_reallocate_gpu_memory_in_mb));
   }
 }
...
@@ -66,25 +66,61 @@ inline std::string demangle(std::string name) {
 inline std::string demangle(std::string name) { return name; }
 #endif

+template <typename StrType>
+inline std::string GetTraceBackString(StrType&& what, const char* file,
+                                      int line) {
+  static constexpr int TRACE_STACK_LIMIT = 100;
+  std::ostringstream sout;
+
+  sout << string::Sprintf("%s at [%s:%d]", std::forward<StrType>(what), file,
+                          line)
+       << std::endl;
+  sout << "PaddlePaddle Call Stacks: " << std::endl;
+#if !defined(_WIN32)
+  void* call_stack[TRACE_STACK_LIMIT];
+  auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
+  auto symbols = backtrace_symbols(call_stack, size);
+  Dl_info info;
+  for (int i = 0; i < size; ++i) {
+    if (dladdr(call_stack[i], &info) && info.dli_sname) {
+      auto demangled = demangle(info.dli_sname);
+      auto addr_offset = static_cast<char*>(call_stack[i]) -
+                         static_cast<char*>(info.dli_saddr);
+      sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, 2 + sizeof(void*) * 2,
+                              call_stack[i], demangled, addr_offset);
+    } else {
+      sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
+                              call_stack[i]);
+    }
+  }
+  free(symbols);
+#else
+  sout << "Windows not support stack backtrace yet.";
+#endif
+  return sout.str();
+}
+
 struct EnforceNotMet : public std::exception {
   std::string err_str_;
-  EnforceNotMet(std::exception_ptr e, const char* f, int l) {
+  EnforceNotMet(std::exception_ptr e, const char* file, int line) {
     try {
       std::rethrow_exception(e);
     } catch (std::exception& e) {
-      Init(e.what(), f, l);
+      err_str_ = GetTraceBackString(e.what(), file, line);
+      SaveErrorInformation(err_str_);
     }
   }

-  EnforceNotMet(const std::string& str, const char* f, int l) {
-    Init(str, f, l);
+  EnforceNotMet(const std::string& str, const char* file, int line)
+      : err_str_(GetTraceBackString(str, file, line)) {
+    SaveErrorInformation(err_str_);
   }

   const char* what() const noexcept override { return err_str_.c_str(); }

  private:
-  const std::string output_file_name{"paddle_err_info"};
-  void saveErrorInformation(const std::string& err) {
+  static void SaveErrorInformation(const std::string& err) {
+    const std::string output_file_name{"paddle_err_info"};
     std::stringstream ss;
     ss << output_file_name;
     std::time_t t = std::time(nullptr);
@@ -98,49 +134,15 @@ struct EnforceNotMet : public std::exception {
       err_file.close();
     }
   }
-
-  template <typename StrType>
-  inline void Init(StrType what, const char* f, int l) {
-    static constexpr int TRACE_STACK_LIMIT = 100;
-    std::ostringstream sout;
-
-    sout << string::Sprintf("%s at [%s:%d]", what, f, l) << std::endl;
-    sout << "PaddlePaddle Call Stacks: " << std::endl;
-#if !defined(_WIN32)
-    void* call_stack[TRACE_STACK_LIMIT];
-    auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
-    auto symbols = backtrace_symbols(call_stack, size);
-    Dl_info info;
-    for (int i = 0; i < size; ++i) {
-      if (dladdr(call_stack[i], &info) && info.dli_sname) {
-        auto demangled = demangle(info.dli_sname);
-        auto addr_offset = static_cast<char*>(call_stack[i]) -
-                           static_cast<char*>(info.dli_saddr);
-        sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
-                                2 + sizeof(void*) * 2, call_stack[i], demangled,
-                                addr_offset);
-      } else {
-        sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
-                                call_stack[i]);
-      }
-    }
-    free(symbols);
-#else
-    sout << "Windows not support stack backtrace yet.";
-#endif
-    err_str_ = sout.str();
-    saveErrorInformation(err_str_);
-  }
 };

 struct EOFException : public std::exception {
   std::string err_str_;
-  EOFException(const char* err_msg, const char* f, int l) {
-    err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, f, l);
+  EOFException(const char* err_msg, const char* file, int line) {
+    err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, file, line);
   }

-  const char* what() const noexcept { return err_str_.c_str(); }
+  const char* what() const noexcept override { return err_str_.c_str(); }
 };

 // Because most enforce conditions would evaluate to true, we can use
@@ -329,6 +331,12 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
                    __LINE__);                                   \
   } while (0)

+#define PADDLE_THROW_BAD_ALLOC(...)                                   \
+  do {                                                                \
+    throw ::paddle::memory::allocation::BadAlloc(                     \
+        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__);  \
+  } while (0)
+
 /*
  * Some enforce helpers here, usage:
  * int a = 1;
...
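The new PADDLE_THROW_BAD_ALLOC macro formats its arguments with ::paddle::string::Sprintf and injects __FILE__/__LINE__ so every bad-alloc carries its throw site. As a rough standalone sketch of the same macro pattern (LocatedError, FormatMessage, and THROW_LOCATED_ERROR are hypothetical names; std::snprintf stands in for string::Sprintf):

#include <cstdio>
#include <exception>
#include <string>
#include <utility>

// Hypothetical stand-in for BadAlloc: embeds the throw site in what().
struct LocatedError : public std::exception {
  LocatedError(std::string msg, const char* file, int line)
      : err_str_(std::move(msg) + " at [" + file + ":" + std::to_string(line) +
                 "]") {}
  const char* what() const noexcept override { return err_str_.c_str(); }
  std::string err_str_;
};

// Printf-style formatting into std::string, standing in for string::Sprintf.
template <typename... Args>
std::string FormatMessage(const char* fmt, Args... args) {
  char buf[512];
  std::snprintf(buf, sizeof(buf), fmt, args...);
  return std::string(buf);
}

// The do { ... } while (0) wrapper makes the macro behave like a single
// statement, so it stays safe inside unbraced if/else branches.
#define THROW_LOCATED_ERROR(...)                                          \
  do {                                                                    \
    throw LocatedError(FormatMessage(__VA_ARGS__), __FILE__, __LINE__);   \
  } while (0)

A call such as THROW_LOCATED_ERROR("Cannot allocate %d bytes", 1024) then mirrors how PADDLE_THROW_BAD_ALLOC is used in best_fit_allocator.cc and cuda_allocator.cc earlier in this diff.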