diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 052e1646de68bdbdb803b2ad41f3e44a70859bed..4f07c1610dc3ef45e5bff6df32a71d4af9c55243 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -74,10 +74,24 @@ class CUDAManagedAllocator : public ManagedAllocator {
   explicit CUDAManagedAllocator(int dev_id) {
     platform::CUDADeviceGuard guard(dev_id);
     max_chunk_size_ = platform::GpuMaxChunkSize();
+
     raw_allocator_ = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
         new CUDAAllocator(platform::CUDAPlace(dev_id))));
-    default_allocator_ = std::make_shared<AutoIncrementAllocator>(
-        [this] { return std::move(BestFitAllocatorCreator()); });
+
+    if (max_chunk_size_ == 0) {
+      default_allocator_ = raw_allocator_;
+    } else {
+      size_t available, total;
+      platform::GpuMemoryUsage(&available, &total);
+      size_t capacity = available / max_chunk_size_;
+
+      if (capacity == 1) {
+        default_allocator_ = BestFitAllocatorCreator();
+      } else {
+        default_allocator_ = std::make_shared<AutoIncrementAllocator>(
+            [this] { return std::move(BestFitAllocatorCreator()); }, capacity);
+      }
+    }
 
     auto* cond_allocator = new ConditionalAllocator();
     cond_allocator
@@ -110,9 +124,11 @@ class CUDAManagedAllocator : public ManagedAllocator {
     chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
     auto* allocation = chunks_.back().get();
     return std::make_shared<AlignedAllocator<64u>>(
-        NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
+        NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
+            new LockedAllocator(std::unique_ptr<Allocator>(
+                new BestFitAllocator(allocation))))));
   }
+
   bool IsAllocThreadSafe() const override { return true; }
 
  private:
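
The key change in allocator_facade.cc is that CUDAManagedAllocator now pre-computes how many best-fit chunks can ever exist: each underlying allocator owns exactly one chunk of max_chunk_size_ bytes, so the currently available GPU memory divided by the chunk size bounds the allocator count. Below is a minimal standalone sketch of that heuristic; the Fake* functions are hypothetical stand-ins for platform::GpuMemoryUsage and platform::GpuMaxChunkSize, not Paddle's real API.

    #include <cstddef>
    #include <cstdio>

    // Hypothetical stand-ins for platform::GpuMemoryUsage / GpuMaxChunkSize.
    void FakeGpuMemoryUsage(std::size_t* available, std::size_t* total) {
      *total = std::size_t(16) << 30;      // pretend a 16 GB device
      *available = std::size_t(12) << 30;  // 12 GB currently free
    }
    std::size_t FakeGpuMaxChunkSize() { return std::size_t(4) << 30; }

    int main() {
      std::size_t available, total;
      FakeGpuMemoryUsage(&available, &total);
      std::size_t max_chunk_size = FakeGpuMaxChunkSize();

      if (max_chunk_size == 0) {
        // Chunking disabled: the facade falls back to the raw allocator.
        std::puts("use raw allocator");
      } else {
        // Each underlying best-fit allocator owns one chunk, so this is an
        // upper bound on how many allocators AutoIncrementAllocator may hold.
        std::size_t capacity = available / max_chunk_size;
        std::printf("capacity = %zu chunk(s)\n", capacity);  // prints 3 here
      }
      return 0;
    }
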
diff --git a/paddle/fluid/memory/allocation/auto_increment_allocator.h b/paddle/fluid/memory/allocation/auto_increment_allocator.h
index 650f1d1cc6c268185924bd77e145beda40772dd8..f026c413d4b6afa6f8cbb1ad32a18dc048566dee 100644
--- a/paddle/fluid/memory/allocation/auto_increment_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_increment_allocator.h
@@ -40,13 +40,18 @@ namespace allocation {
 // allocator. The allocation requests from many threads may be dispatched
 // to the same underlying allocator. So the underlying allocator must be
 // thread safe.
+//
+// NOTE(zjl): Add a capacity parameter to the constructor. A high-performance,
+// thread-safe std::vector with varying size is hard to implement.
+// Fortunately, the total GPU memory and the chunk size are known in advance,
+// so a suitable capacity for AutoIncrementAllocator can be computed up front.
 class AutoIncrementAllocator : public ManagedAllocator {
  public:
   // Creator is the method to create ManagedAllocator
   using AllocatorCreator = std::function<std::shared_ptr<ManagedAllocator>()>;
 
-  explicit AutoIncrementAllocator(AllocatorCreator&& creator)
-      : creator_(std::move(creator)), prev_success_allocator_{0} {}
+  explicit AutoIncrementAllocator(AllocatorCreator&& creator, size_t capacity)
+      : creator_(std::move(creator)), underlying_allocators_(capacity) {}
   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override;
   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;
   bool IsAllocThreadSafe() const override;
@@ -56,15 +61,13 @@ class AutoIncrementAllocator : public ManagedAllocator {
   template <typename Callback>
   inline typename std::result_of<Callback(ManagedAllocator&)>::type
   InvokeOrCreateUnderlyingAllocator(Callback callback) {
-    std::shared_ptr<std::vector<AllocatorCreator::result_type>>
-        underlying_allocators = underlying_allocators_;
-    size_t retry_count = underlying_allocators->size();
-    size_t allocator_num = retry_count;
     auto cur = prev_success_allocator_.load();
+    size_t retry_count = allocator_num_.load();
+    size_t allocator_num = retry_count;
     while (retry_count-- > 0) {  // until the retry count is zero
       try {
-        auto res = callback(*((*underlying_allocators)[cur]));
-        prev_success_allocator_.store(cur);
+        auto res = callback(*underlying_allocators_[cur]);
+        prev_success_allocator_ = cur;
         return std::move(res);
       } catch (BadAlloc&) {
         if (++cur >= allocator_num) {
@@ -77,20 +80,34 @@ class AutoIncrementAllocator : public ManagedAllocator {
     }
 
     // No suitable allocator
+    // This happens when the first allocator is exhausted and
+    // there is more than one concurrent allocation request.
+    // In this situation, the first allocation request would succeed
+    // and the second would fail if we did not use the allocator
+    // newly created by the first request.
+    for (size_t new_allocator_num = allocator_num_.load();
+         allocator_num < new_allocator_num; ++allocator_num) {
+      try {
+        auto ret = callback(*underlying_allocators_[allocator_num]);
+        prev_success_allocator_ = allocator_num;
+        return std::move(ret);
+      } catch (BadAlloc&) {
+      } catch (...) {
+        std::rethrow_exception(std::current_exception());
+      }
+    }
+
     ManagedAllocator* new_allocator;
     {
       std::lock_guard<std::mutex> guard(mtx_);
-      auto old_size = underlying_allocators_->size();
-      decltype(underlying_allocators_) new_allocators(
-          new std::vector<AllocatorCreator::result_type>(old_size + 1));
-      for (size_t i = 0; i < old_size; ++i) {
-        (*new_allocators)[i] = (*underlying_allocators_)[i];
-      }
-
-      (*new_allocators)[old_size] = creator_();
-      new_allocator = (*new_allocators)[old_size].get();
-      underlying_allocators_ = new_allocators;
-      prev_success_allocator_.store(old_size);
+      auto old_size = allocator_num_.load();
+      PADDLE_ENFORCE_LT(old_size, underlying_allocators_.size(),
+                        "Allocator number exceeds capacity %d",
+                        underlying_allocators_.size());
+      underlying_allocators_[old_size] = creator_();
+      new_allocator = underlying_allocators_[old_size].get();
+      prev_success_allocator_ = old_size;
+      allocator_num_.fetch_add(1);
     }
 
     PADDLE_ENFORCE(
@@ -102,9 +119,8 @@ class AutoIncrementAllocator : public ManagedAllocator {
 
   AllocatorCreator creator_;
 
-  // Use std::shared_ptr to ensure thread-safety
-  std::shared_ptr<std::vector<AllocatorCreator::result_type>>
-      underlying_allocators_;
+  std::vector<AllocatorCreator::result_type> underlying_allocators_;
+  std::atomic<size_t> allocator_num_{0};
 
   // Use std::atomic rather than std::mutex, since std::atomic is usually
   // lock-free
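
The NOTE(zjl) rationale boils down to a common concurrency pattern: pre-size a vector once, publish each new slot by bumping an atomic counter, and let readers scan without taking a lock. The following is a minimal, self-contained sketch of that pattern under the same assumption the header makes (the capacity is fixed, so the vector never reallocates); FixedCapacityList is an illustrative name, not a Paddle class.

    #include <atomic>
    #include <cstddef>
    #include <memory>
    #include <mutex>
    #include <vector>

    // Sketch: fixed-capacity slots, an atomic count of published slots,
    // and a mutex that serializes writers only. Readers never lock.
    template <typename T>
    class FixedCapacityList {
     public:
      explicit FixedCapacityList(std::size_t capacity) : slots_(capacity) {}

      // Writer side: fill the slot first, then publish it by bumping size_.
      void Append(std::shared_ptr<T> item) {
        std::lock_guard<std::mutex> guard(mtx_);
        std::size_t pos = size_.load();
        slots_.at(pos) = std::move(item);  // throws if capacity is exceeded
        size_.fetch_add(1);  // readers may observe this slot from now on
      }

      // Reader side: only indices below Size() are guaranteed initialized.
      std::shared_ptr<T> Get(std::size_t i) const { return slots_[i]; }
      std::size_t Size() const { return size_.load(); }

     private:
      std::vector<std::shared_ptr<T>> slots_;
      std::atomic<std::size_t> size_{0};
      std::mutex mtx_;
    };

Because slots_ never reallocates, a reader's access can never be invalidated by a concurrent Append. That is exactly the trade the patch makes: replacing the copy-on-write std::shared_ptr<std::vector<...>> with a fixed-capacity vector plus an atomic count.
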
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index aa338f467562905ffd98846ea71cced3103d6196..1d9e7177f9547af8a083b2304163497b987fac46 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -26,10 +26,11 @@ static int HighestBitPos(size_t N) {
   if (UNLIKELY(N == 0)) {
     return 0;
   } else {
-    // NOTE: here we can use __builtin_clz in GCC.
-    // However, let's use std::log2 for better readability
-    // and trust std::log2's performance.
+#ifdef __GNUC__
+    return sizeof(unsigned int) * 8 - __builtin_clz(N);
+#else
     return static_cast<int>(std::log2(N) + 1);
+#endif
   }
 }
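
The GCC branch computes the same value as the std::log2 branch: for N > 0, 32 - __builtin_clz(N) is the 1-based position of the highest set bit. Below is a small standalone harness (not part of the patch) to spot-check that the two branches agree; it requires GCC or Clang for __builtin_clz, and since __builtin_clz takes an unsigned int, the comparison sticks to 32-bit values.

    #include <cmath>
    #include <cstdio>

    // The two HighestBitPos strategies from the patch, side by side.
    static int HighestBitPosBuiltin(unsigned int N) {
      if (N == 0) return 0;
      return sizeof(unsigned int) * 8 - __builtin_clz(N);
    }

    static int HighestBitPosLog2(unsigned int N) {
      if (N == 0) return 0;
      return static_cast<int>(std::log2(N) + 1);
    }

    int main() {
      // Spot-check that both branches agree on a few values.
      const unsigned int samples[] = {1, 2, 3, 4, 1023, 1024, 1u << 31};
      for (unsigned int n : samples) {
        std::printf("N=%u builtin=%d log2=%d\n", n, HighestBitPosBuiltin(n),
                    HighestBitPosLog2(n));
      }
      return 0;
    }
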