From c6189637cd7f680501edc0593bb3875700f3da05 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Thu, 23 May 2019 18:44:18 +0800
Subject: [PATCH] Fix allocator bug (#16712)

* Revert "Revert "Fix allocator bug""

This reverts commit 174d0d0b90a610807d6f82927aad4def227ee643.

* Revert "fix travis ci"

This reverts commit 5656fa9f7ca278aff7319485c0d289a4ffc2f9d0.

test=develop

* add inlined_vector.h, test=develop

* add inlined_vector_test,test=develop
---
 paddle/fluid/framework/CMakeLists.txt         |   2 +
 paddle/fluid/framework/inlined_vector.h       |  69 ++++++++++
 paddle/fluid/framework/inlined_vector_test.cc |  82 ++++++++++++
 paddle/fluid/framework/operator.h             |   3 -
 paddle/fluid/memory/allocation/CMakeLists.txt |  24 ++--
 .../memory/allocation/aligned_allocator.h     |   2 +
 paddle/fluid/memory/allocation/allocator.cc   |  17 +--
 paddle/fluid/memory/allocation/allocator.h    | 119 ++++++++++++++----
 .../memory/allocation/allocator_facade.cc     |  48 ++++---
 .../memory/allocation/allocator_strategy.cc   |  14 ++-
 .../memory/allocation/best_fit_allocator.cc   |   2 +-
 .../memory/allocation/best_fit_allocator.h    |   2 +-
 .../memory/allocation/buffered_allocator.cc   |  22 ++--
 .../memory/allocation/buffered_allocator.h    |   6 +-
 .../allocation/buffered_allocator_test.cc     |   3 +-
 .../fluid/memory/allocation/cpu_allocator.cc  |  28 +++--
 .../fluid/memory/allocation/cpu_allocator.h   |  10 +-
 .../fluid/memory/allocation/cuda_allocator.cc |  10 +-
 .../fluid/memory/allocation/cuda_allocator.h  |   9 +-
 .../memory/allocation/legacy_allocator.cc     |   2 +-
 .../memory/allocation/legacy_allocator.h      |   2 +-
 .../memory/allocation/locked_allocator.cc     |  19 ++-
 .../memory/allocation/locked_allocator.h      |   6 +-
 .../naive_best_fit_allocator_facade_test.cc   |  91 ++++++++++++++
 .../memory/allocation/pinned_allocator.cc     |   9 +-
 .../memory/allocation/pinned_allocator.h      |   8 +-
 .../memory/allocation/retry_allocator.cc      |  18 +--
 .../fluid/memory/allocation/retry_allocator.h |  23 ++--
 .../memory/allocation/zero_size_allocator.cc  |  11 +-
 .../memory/allocation/zero_size_allocator.h   |   7 +-
 paddle/fluid/platform/temporary_allocator.cc  |  28 ++---
 paddle/fluid/platform/temporary_allocator.h   |  14 +--
 paddle/fluid/pybind/pybind.cc                 |   1 +
 paddle/fluid/string/printf.h                  |   6 +-
 34 files changed, 484 insertions(+), 233 deletions(-)
 create mode 100644 paddle/fluid/framework/inlined_vector.h
 create mode 100644 paddle/fluid/framework/inlined_vector_test.cc
 create mode 100644 paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 4e00630bb..c41efc5e0 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -225,6 +225,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 
 cc_test(tuple_test SRCS tuple_test.cc )
 
+cc_test(inlined_vector_test SRCS inlined_vector_test.cc)
+
 if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)
diff --git a/paddle/fluid/framework/inlined_vector.h b/paddle/fluid/framework/inlined_vector.h
new file mode 100644
index 000000000..2a7f26b9f
--- /dev/null
+++ b/paddle/fluid/framework/inlined_vector.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cstdint>
+#include <vector>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+// Stack-like container: the first N items live in an in-object array and
+// any further items spill into a std::vector. No bounds checks are made.
+template <typename T, size_t N>
+class InlinedVector {
+  static_assert(N > 0, "N must be larger than 0");
+
+ public:
+  inline InlinedVector() : len_(0) {}
+
+  inline size_t size() const { return len_; }
+
+  inline T& operator[](size_t i) {
+    if (i < N) return head_[i];
+    return tail_[i - N];
+  }
+  inline const T& operator[](size_t i) const {
+    if (i < N) return head_[i];
+    return tail_[i - N];
+  }
+
+  // Appends a copy of `item`, spilling to tail_ once head_ is full.
+  inline void emplace_back(const T& item) {
+    if (UNLIKELY(len_ >= N)) {
+      tail_.emplace_back(item);
+      ++len_;
+      return;
+    }
+    head_[len_++] = item;
+  }
+
+  inline void pop_back() {
+    if (UNLIKELY(len_ > N)) tail_.pop_back();
+    --len_;
+  }
+
+  inline T& back() {
+    return LIKELY(len_ <= N) ? head_[len_ - 1] : tail_.back();
+  }
+
+ private:
+  T head_[N];            // inline storage for indices [0, N)
+  size_t len_;           // total item count, inline plus spilled
+  std::vector<T> tail_;  // overflow storage for indices >= N
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/inlined_vector_test.cc b/paddle/fluid/framework/inlined_vector_test.cc
new file mode 100644
index 000000000..003c0d7bb
--- /dev/null
+++ b/paddle/fluid/framework/inlined_vector_test.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/inlined_vector.h"
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+#include <vector>
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace framework {
+
+// Copies every element of `vec`, in index order, into a std::vector.
+template <typename T, size_t N>
+static std::vector<T> ToStdVector(const framework::InlinedVector<T, N> &vec) {
+  const size_t count = vec.size();
+  std::vector<T> copied;
+  copied.reserve(count);
+  for (size_t idx = 0; idx != count; ++idx) copied.emplace_back(vec[idx]);
+  return copied;
+}
+
+// Pushes n random ints into a std::vector and an InlinedVector<int, N> in
+// lockstep, then pops them all, asserting via gtest that both always agree.
+template <size_t N>
+void InlinedVectorCheck(size_t n) {
+  std::srand(std::time(nullptr));
+
+  std::vector<int> std_vec;
+  framework::InlinedVector<int, N> vec;
+
+  for (size_t i = 0; i < n; ++i) {
+    int value = rand();  // NOLINT
+
+    std_vec.emplace_back(value);
+    vec.emplace_back(value);
+
+    ASSERT_EQ(std_vec.size(), vec.size());
+    ASSERT_EQ(std_vec.back(), vec.back());
+    ASSERT_EQ(vec.back(), value);
+  }
+
+  // Element-wise comparison through operator[] (see ToStdVector).
+  ASSERT_EQ(std_vec, ToStdVector(vec));
+
+  for (size_t i = 0; i < n; ++i) {
+    ASSERT_EQ(std_vec.size(), vec.size());
+    ASSERT_EQ(std_vec.back(), vec.back());
+    std_vec.pop_back();
+    vec.pop_back();
+    ASSERT_EQ(std_vec.size(), vec.size());
+  }
+
+  ASSERT_EQ(std_vec.size(), static_cast<size_t>(0));
+  ASSERT_EQ(vec.size(), static_cast<size_t>(0));
+}
+
+TEST(inlined_vector, inlined_vector) {
+  for (size_t i = 0; i < 20; ++i) {  // i items are pushed, then popped
+    InlinedVectorCheck<1>(i);
+    InlinedVectorCheck<10>(i);  // N = 10, 15: spills once i > N
+    InlinedVectorCheck<15>(i);
+    InlinedVectorCheck<20>(i);  // N >= 20 > i: inline storage only
+    InlinedVectorCheck<21>(i);
+    InlinedVectorCheck<25>(i);
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 0c8ab533f..f216f3949 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -366,9 +366,6 @@ class ExecutionContext {
     auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
         allocation_ptr, deleter);
 
-    PADDLE_ENFORCE(
-        dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
-        "The AllocationPtr must be TemporaryAllocation.");
     PADDLE_ENFORCE_GE(allocation_ptr->size(),
                       framework::product(dim) * sizeof(T));
 
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 3dbbea3dd..4c4ae72ef 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -4,6 +4,7 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
+cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 
 if (WITH_GPU)
@@ -37,30 +38,19 @@ else ()
     set(AllocatorFacadeDeps)
 endif()
 
+list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator)
+
 cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
 cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
-cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
-cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
-cc_library(allocator_facade SRCS allocator_facade.cc DEPS
-        ${AllocatorFacadeDeps}
-        cpu_allocator
-        locked_allocator
-        best_fit_allocator
-        aligned_allocator
-        auto_increment_allocator
-        zero_size_allocator
-        conditional_allocator
-        retry_allocator
-        buffered_allocator
-        allocator_strategy
-        legacy_allocator
-        )
+cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
+cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
 
 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
 
-cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
+cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
 
+cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
 if (WITH_TESTING)
   set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 endif()
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index 064acd06e..b536d4276 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -94,6 +94,8 @@ class AlignedAllocator : public ThinAlignedAllocator {
         underlying_allocator_->Allocate(size + kAlignment, attr);
     return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size);
   }
+
+  void FreeImpl(Allocation* allocation) override { delete allocation; }
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc
index 8fb8a5fb8..dc0f34fec 100644
--- a/paddle/fluid/memory/allocation/allocator.cc
+++ b/paddle/fluid/memory/allocation/allocator.cc
@@ -19,24 +19,11 @@
 namespace paddle {
 namespace memory {
 namespace allocation {
-Allocation::~Allocation() {}
-
-Allocator::~Allocator() {}
 
 bool Allocator::IsAllocThreadSafe() const { return false; }
 
-AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
-  auto ptr = AllocateImpl(size, attr);
-  ptr->set_allocator(this);
-  return AllocationPtr(ptr);
-}
-
-void Allocator::Free(Allocation* allocation) { delete allocation; }
-
-const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
-
-void AllocationDeleter::operator()(Allocation* allocation) const {
-  auto* allocator = allocation->allocator();
+void Allocator::FreeImpl(Allocation* allocation) {
+  Allocator* allocator = allocation->TopDecoratedAllocator();
   allocator->Free(allocation);
 }
 
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 346527893..5acdd9d0f 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -15,8 +15,10 @@
 #pragma once
 #include <memory>
 #include <string>
+#include <type_traits>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/inlined_vector.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -26,40 +28,73 @@ namespace allocation {
 // Exception when `Alloc`/`AllocShared` failed
 class BadAlloc : public std::exception {
  public:
-  explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
-  const char* what() const noexcept override;
+  inline explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
+
+  inline const char* what() const noexcept override { return msg_.c_str(); }
 
  private:
   std::string msg_;
 };
 
-class Allocation;
-class AllocationDeleter {
- public:
-  void operator()(Allocation* allocation) const;
-};
-
 class Allocator;
+
 // Allocation is the object holding the actually pointer. Use
 // `Allocation::ptr()` will returns the pointer that allocated.
 //
 // NOTE: this is the base class of Allocation. Each allocator can use its own
 //       allocation object.
 // NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0
+
+/**
+ * Allocation is returned by Allocator::Allocate() method.
+ *
+ * An allocator may be decorated by another allocator. For example, we can
+ * decorate a RetryAllocator to any allocator to perform allocation retry when
+ * first allocation request fails.
+ *
+ * The Allocator design is explained as follows:
+ *
+ * Suppose we have an allocator which is decorated by several allocators:
+ *
+ *   A(1) <- A(2) <- A(3) <- ... <- A(n)
+ *
+ * , and the public allocator is A(1).
+ *
+ * The allocation process would be:
+ *
+ *   A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate()
+ *
+ * , and the free process would be:
+ *
+ *   A(1).Free() -> A(2).Free() -> ... -> A(n).Free()
+ *
+ * Therefore, we should record the allocator chain when allocating, so
+ * that we can free the allocation in the reverse order of allocator chain.
+ * The field `decorated_allocators_` is used to record this chain.
+ *
+ * Another example is that we want to add additional fields in Allocation,
+ * e.g., as is done in AlignedAllocator.
+ * In this case, we should declare a derived class of Allocation, which
+ * contains an underlying Allocation allocated by the underlying allocator.
+ * Therefore, `decorated_allocators_` of the new Allocation object would
+ * be a new chain, differing from the underlying Allocation object.
+ */
 class Allocation {
  public:
-  Allocation(void* ptr, size_t size, platform::Place place)
-      : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}
+  inline Allocation(void* ptr, size_t size, platform::Place place)
+      : ptr_(ptr), size_(size), place_(place) {}
 
   Allocation(const Allocation& o) = delete;
   Allocation& operator=(const Allocation& o) = delete;
+  Allocation(Allocation&& o) = delete;
+  Allocation& operator=(Allocation&& o) = delete;
 
   // Returns the holding pointer.
   // NOTE: For performance consideration, it is better not to make this method
   // as a virtual method. If we want to implement a `defragmentation` later,
   // we might need to make `ptr_` field as a protected field, and add a virtual
   // method like `defragmentation` to change `ptr_`.
-  void* ptr() const { return ptr_; }
+  inline void* ptr() const { return ptr_; }
 
   // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
   // last valid element.
@@ -70,24 +105,38 @@ class Allocation {
   //    The raw pointer might not aligned, so an offset might be added to raw
   //    the pointer. The size of this allocation will be
   //    `size + kAlignemnt - offset`.
-  size_t size() const { return size_; }
+  inline size_t size() const { return size_; }
+
+  inline const platform::Place& place() const { return place_; }
 
-  const platform::Place& place() const { return place_; }
+  virtual ~Allocation() {}
 
-  Allocator* allocator() { return allocator_; }
+ private:
+  inline void RegisterDecoratedAllocator(Allocator* allocator) {
+    decorated_allocators_.emplace_back(allocator);
+  }
 
-  void set_allocator(Allocator* allocator) { allocator_ = allocator; }
+  inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }
 
-  virtual ~Allocation();
+  inline Allocator* TopDecoratedAllocator() {
+    return decorated_allocators_.back();
+  }
 
  private:
-  Allocator* allocator_;
   void* ptr_;
   size_t size_;
   platform::Place place_;
-};
 
-using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
+  // NOTE(zjl): decorated_allocators_ is usually a small vector, so we
+  // reserve a small inline buffer for it to avoid frequent heap allocation.
+  static constexpr size_t kReserveAllocatorNum = 8;
+  using DecoratedAllocatorStack =
+      framework::InlinedVector<Allocator*, kReserveAllocatorNum>;
+
+  DecoratedAllocatorStack decorated_allocators_;
+
+  friend class Allocator;
+};
 
 // Base interface class of memory Allocator.
 // To allocate a memory, allocator needs two parameters:
@@ -126,22 +175,42 @@ class Allocator {
     NumOfAttrs = 5  // The number of all attributes. It is used internally.
   };
 
-  virtual ~Allocator();
+  virtual ~Allocator() {}
+
+  // unique_ptr deleter: returns an allocation to its top decorated allocator.
+  class AllocationDeleter {
+   public:
+    inline void operator()(Allocation* allocation) const {
+      allocation->TopDecoratedAllocator()->Free(allocation);
+    }
+  };
+
+  using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
 
   // Allocate an allocation.
-  AllocationPtr Allocate(size_t size, Allocator::Attr attr = kDefault);
+  inline AllocationPtr Allocate(size_t size, Allocator::Attr attr = kDefault) {
+    Allocation* allocation = AllocateImpl(size, attr);
+    allocation->RegisterDecoratedAllocator(this);  // last in, first freed
+    return AllocationPtr(allocation);
+  }
+
+  // Internal use only: meant for AllocationDeleter and allocator subclasses.
+  inline void Free(Allocation* allocation) {
+    allocation->PopDecoratedAllocator();  // pops the top of the stack
+    FreeImpl(allocation);
+  }
 
   // True if the `Allocate` is thread safe.
   virtual bool IsAllocThreadSafe() const;
 
  protected:
-  virtual void Free(Allocation* allocation);
   virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;
-
- private:
-  friend class AllocationDeleter;
+  virtual void FreeImpl(Allocation* allocation);
 };
 
+using AllocationDeleter = Allocator::AllocationDeleter;
+using AllocationPtr = Allocator::AllocationPtr;
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index a3b73e3ba..09328aded 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -49,6 +49,17 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// Wraps `allocator` in a RetryAllocator when retry_time is positive;
+// otherwise hands `allocator` back untouched.
+static inline std::shared_ptr<Allocator> WrapRetryAllocator(
+    std::shared_ptr<Allocator> allocator, int64_t retry_time) {
+  if (retry_time <= 0) {
+    return allocator;
+  }
+  return std::shared_ptr<Allocator>(
+      new RetryAllocator(std::move(allocator), retry_time));
+}
+
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
 class CPUManagedAllocator : public Allocator {
  public:
@@ -112,14 +123,10 @@ class ChunkedAllocator : public Allocator {
   std::shared_ptr<Allocator> CreateAllocatorWithChunk() {
     chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
     auto* allocation = chunks_.back().get();
-    std::unique_ptr<Allocator> allocator(new LockedAllocator(
-        std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
+    std::shared_ptr<Allocator> allocator(new LockedAllocator(
+        std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));
 
-    if (retry_time_ > 0) {
-      auto* retry_allocator =
-          new RetryAllocator(std::move(allocator), retry_time_);
-      allocator.reset(retry_allocator);
-    }
+    allocator = WrapRetryAllocator(allocator, retry_time_);
 
     return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
   }
@@ -190,13 +197,23 @@ class AllocatorFacadePrivate {
   ~AllocatorFacadePrivate() = default;
 
   AllocatorFacadePrivate() {
-    if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) {
-      InitLegacyAllocator();
-    } else {
-      InitCPUAllocator();
-      InitCUDAAllocator();
-      InitCUDAPinnedAllocator();
-      WrapZeroSizeAllocator();
+    auto strategy = GetAllocatorStrategy();
+    switch (strategy) {
+      case AllocatorStrategy::kLegacy: {
+        InitLegacyAllocator();
+        break;
+      }
+      case AllocatorStrategy::kNaiveBestFit: {
+        InitCPUAllocator();
+        InitCUDAAllocator();
+        InitCUDAPinnedAllocator();
+        WrapZeroSizeAllocator();
+        break;
+      }
+      default: {
+        PADDLE_THROW("Unsupported allocator strategy: %d",
+                     static_cast<int>(strategy));
+      }
     }
   }
 
@@ -254,8 +271,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
 
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, Allocator::Attr attr) {
-  return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
-                                     AllocationDeleter());
+  return std::shared_ptr<Allocation>(Alloc(place, size, attr));
 }
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index 8cebda900..fff94c01e 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -19,16 +19,22 @@
 DEFINE_string(
     allocator_strategy, "legacy",
     "The allocation strategy. Legacy means the original allocator of Fluid."
-    "New means the experimental allocators of Fluid. in [legacy, new]");
+    " naive_best_fit means the experimental best fit allocator. "
+    "Enum in [legacy, naive_best_fit].");
 
 namespace paddle {
 namespace memory {
 namespace allocation {
 
 static AllocatorStrategy GetStrategyFromFlag() {
-  return FLAGS_allocator_strategy == "legacy"
-             ? AllocatorStrategy::kLegacy
-             : AllocatorStrategy::kNaiveBestFit;
+  // Any flag value other than the two known strategy names is rejected.
+  if (FLAGS_allocator_strategy == "legacy") {
+    return AllocatorStrategy::kLegacy;
+  }
+  if (FLAGS_allocator_strategy == "naive_best_fit") {
+    return AllocatorStrategy::kNaiveBestFit;
+  }
+  PADDLE_THROW("Unsupported allocator strategy: %s", FLAGS_allocator_strategy);
 }
 
 AllocatorStrategy GetAllocatorStrategy() {
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index e3d6c2f51..d87dd9a4b 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const {
   }
   return num;
 }
-void BestFitAllocator::Free(Allocation* allocation) {
+void BestFitAllocator::FreeImpl(Allocation* allocation) {
   auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
   PADDLE_ENFORCE_NOT_NULL(bf_allocation,
                           "The input allocation is not BestFitAllocation.");
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h
index 4f10f2b53..c137438c0 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
@@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator {
   void InsertFreeNode(const ListIt& it);
 
  protected:
-  void Free(Allocation* allocation) override;
+  void FreeImpl(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc
index fc75abc9d..e04c0aa34 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator.cc
@@ -22,11 +22,11 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-BufferedAllocator::BufferedAllocator(std::unique_ptr<Allocator> &&allocator)
+BufferedAllocator::BufferedAllocator(std::shared_ptr<Allocator> allocator)
     : underlying_allocator_(std::move(allocator)) {
   PADDLE_ENFORCE_NOT_NULL(
       underlying_allocator_,
-      "Underlying allocator of BufferedAllocator must be unmanaged");
+      "Underlying allocator of BufferedAllocator must not be null");
   if (underlying_allocator_->IsAllocThreadSafe()) {
     mtx_.reset(new std::mutex());
   }
@@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) {
   while (!allocations_.empty()) {  // free the largest
     auto it = --allocations_.end();
     cur += it->second->size();
-    delete it->second.release();
+    underlying_allocator_->Free(it->second.release());
     allocations_.erase(it);
     if (cur >= size) return;
   }
 }
 
-bool BufferedAllocator::IsAllocThreadSafe() const {
-  return this->underlying_allocator_->IsAllocThreadSafe();
-}
-void BufferedAllocator::Free(Allocation *allocation) {
+bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; }
+
+void BufferedAllocator::FreeImpl(Allocation *allocation) {
   platform::LockGuardPtr<std::mutex> guard(mtx_);
   allocations_.emplace(allocation->size(), AllocationPtr(allocation));
 }
+
 Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   {
     platform::LockGuardPtr<std::mutex> guard(mtx_);
@@ -61,17 +61,15 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
     if (it != allocations_.end() && it->first < size * 2) {
       AllocationPtr result(std::move(it->second));
       allocations_.erase(it);
-      return new AllocationWithUnderlying(std::move(result));
+      return result.release();
     }
   }
 
   try {
-    return new AllocationWithUnderlying(
-        underlying_allocator_->Allocate(size, attr));
+    return underlying_allocator_->Allocate(size, attr).release();
   } catch (BadAlloc &) {
     FreeCache(size);
-    return new AllocationWithUnderlying(
-        underlying_allocator_->Allocate(size, attr));
+    return underlying_allocator_->Allocate(size, attr).release();
   }
 }
 
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h
index d44a3f85b..c72839570 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/buffered_allocator.h
@@ -31,7 +31,7 @@ namespace allocation {
 // underlying_allocator_
 class BufferedAllocator : public Allocator {
  public:
-  explicit BufferedAllocator(std::unique_ptr<Allocator> &&allocator);
+  explicit BufferedAllocator(std::shared_ptr<Allocator> allocator);
 
   ~BufferedAllocator();
 
@@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator {
   void FreeCache(size_t size);
 
  protected:
-  void Free(Allocation *allocation) override;
+  void FreeImpl(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::unique_ptr<Allocator> underlying_allocator_;
+  std::shared_ptr<Allocator> underlying_allocator_;
   std::multimap<size_t, AllocationPtr> allocations_;
   std::unique_ptr<std::mutex> mtx_;
 };
diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
index c8bd5292c..854a117b0 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
@@ -14,7 +14,6 @@
 
 #include "paddle/fluid/memory/allocation/buffered_allocator.h"
 #include <gtest/gtest.h>
-#include <memory>
 #include <utility>
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -66,7 +65,7 @@ class StubAllocator : public Allocator {
   size_t GetFreeCount() const { return destruct_count_; }
 
  protected:
-  void Free(Allocation *allocation) override {
+  void FreeImpl(Allocation *allocation) override {
     auto *alloc = dynamic_cast<StubAllocation *>(allocation);
     PADDLE_ENFORCE_NOT_NULL(alloc);
     if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr());
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc
index cc81a6f7b..90c49c87a 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.cc
+++ b/paddle/fluid/memory/allocation/cpu_allocator.cc
@@ -20,25 +20,27 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-CPUAllocation::CPUAllocation(void *ptr, size_t size)
-    : Allocation(ptr, size, platform::CPUPlace()) {}
-
 bool CPUAllocator::IsAllocThreadSafe() const { return true; }
 
-void CPUAllocator::Free(Allocation *allocation) {
-  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation *>(allocation));
-  free(allocation->ptr());
+void CPUAllocator::FreeImpl(Allocation *allocation) {
+  void *p = allocation->ptr();
+#ifdef _WIN32
+  _aligned_free(p);  // matches _aligned_malloc in AllocateImpl
+#else
+  free(p);  // matches posix_memalign in AllocateImpl
+#endif
   delete allocation;
 }
 
 Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
-  void *ptr;
-  auto status = posix_memalign(&ptr, kAlignment, size);
-  if (UNLIKELY(status) != 0) {
-    throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d",
-                                   size, status));
-  }
-  return new CPUAllocation(ptr, size);
+  void *p = nullptr;
+#ifdef _WIN32
+  int status = (p = _aligned_malloc(size, kAlignment)) != nullptr ? 0 : -1;
+#else
+  int status = posix_memalign(&p, kAlignment, size);
+#endif
+  PADDLE_ENFORCE_EQ(status, 0, "Alloc %ld error!", size);  // nonzero: failed
+  return new Allocation(p, size, platform::CPUPlace());
 }
 }  // namespace allocation
 }  // namespace memory
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index 26d3643f4..3eb1416b0 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -31,19 +31,13 @@ namespace allocation {
 //
 // NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import
 // an open-sourced allocator into Paddle.
-class CPUAllocator;
-class CPUAllocation : public Allocation {
- public:
-  CPUAllocation(void* ptr, size_t size);
-};
-
 class CPUAllocator : public Allocator {
  public:
-  constexpr static size_t kAlignment = 64u;
+  constexpr static size_t kAlignment = 4096UL;
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void Free(Allocation* allocation) override;
+  void FreeImpl(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 };
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 430bf0be9..895a24a6a 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -23,15 +23,14 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
-void CUDAAllocator::Free(Allocation* allocation) {
+void CUDAAllocator::FreeImpl(Allocation* allocation) {
   platform::CUDADeviceGuard guard(place_.device);
-  auto* cuda_allocation = dynamic_cast<CUDAAllocation*>(allocation);
-  PADDLE_ENFORCE_NOT_NULL(cuda_allocation);
-  PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(cuda_allocation->place()),
+  PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(allocation->place()),
                     place_);
   PADDLE_ENFORCE(cudaFree(allocation->ptr()));
   delete allocation;
 }
+
 Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   platform::CUDADeviceGuard guard(place_.device);
   void* ptr;
@@ -41,8 +40,9 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
         "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
         status, cudaGetErrorString(status)));
   }
-  return new CUDAAllocation(ptr, size, platform::Place(place_));
+  return new Allocation(ptr, size, platform::Place(place_));
 }
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h
index 63726f582..580a2d1df 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_allocator.h
@@ -20,13 +20,6 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-// CUDA System allocator and allocation.
-// Just a flag type.
-class CUDAAllocation : public Allocation {
- public:
-  using Allocation::Allocation;
-};
-
 class CUDAAllocator : public Allocator {
  public:
   explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {}
@@ -35,7 +28,7 @@ class CUDAAllocator : public Allocator {
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void Free(Allocation* allocation) override;
+  void FreeImpl(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 2ecb44ff1..0f4a55ced 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -347,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   return tmp_alloc;
 }
 
-void LegacyAllocator::Free(Allocation *allocation) {
+void LegacyAllocator::FreeImpl(Allocation *allocation) {
   boost::apply_visitor(
       legacy::FreeVisitor(allocation->ptr(), allocation->size()),
       allocation->place());
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h
index d9bdae153..27cd42ea3 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
@@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator {
 
  protected:
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
-  void Free(Allocation *allocation) override;
+  void FreeImpl(Allocation *allocation) override;
 
  private:
   platform::Place place_;
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
index 62d768c58..c43099cc8 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
@@ -17,6 +17,7 @@
 #include <utility>
 #include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
+
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -24,26 +25,24 @@ namespace allocation {
 bool LockedAllocator::IsAllocThreadSafe() const { return true; }
 
 LockedAllocator::LockedAllocator(
-    std::unique_ptr<Allocator> &&underlying_allocator)
+    std::shared_ptr<Allocator> underlying_allocator)
     : underlying_allocator_(std::move(underlying_allocator)) {
   PADDLE_ENFORCE_NOT_NULL(underlying_allocator_);
   if (!underlying_allocator_->IsAllocThreadSafe()) {
     mtx_.reset(new std::mutex());
   }
 }
-void LockedAllocator::Free(Allocation *allocation) {
-  {
-    platform::LockGuardPtr<std::mutex> guard(mtx_);
-    reinterpret_cast<AllocationWithUnderlying *>(allocation)
-        ->allocation_.reset();  // Destroy inner allocation
-  }
-  delete allocation;
+
+void LockedAllocator::FreeImpl(Allocation *allocation) {
+  platform::LockGuardPtr<std::mutex> guard(mtx_);
+  underlying_allocator_->Free(allocation);
 }
+
 Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   platform::LockGuardPtr<std::mutex> guard(mtx_);
-  return new AllocationWithUnderlying(
-      underlying_allocator_->Allocate(size, attr));
+  return underlying_allocator_->Allocate(size, attr).release();
 }
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
index 4967b9bb8..b735ccef1 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -24,15 +24,15 @@ namespace allocation {
 // A allocator to make underlying allocator thread safe.
 class LockedAllocator : public Allocator {
  public:
-  explicit LockedAllocator(std::unique_ptr<Allocator> &&underlying_allocator);
+  explicit LockedAllocator(std::shared_ptr<Allocator> underlying_allocator);
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void Free(Allocation *allocation) override;
+  void FreeImpl(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::unique_ptr<Allocator> underlying_allocator_;
+  std::shared_ptr<Allocator> underlying_allocator_;
   std::unique_ptr<std::mutex> mtx_;
 };
 
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc
new file mode 100644
index 000000000..3334589a4
--- /dev/null
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif
+
+DECLARE_string(allocator_strategy);
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(allocator, allocator) {
+#ifdef PADDLE_WITH_CUDA
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+
+  FLAGS_allocator_strategy = "naive_best_fit";
+
+  auto &instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;
+
+  {
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
+  }
+
+  {
+    // Allocate 2GB gpu memory
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
+  }
+
+  {
+    // Allocate 1MB cuda pinned memory
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation = instance.Alloc(place, size);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
+#endif
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index de81d12cc..5a3d81721 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -20,20 +20,15 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
-void CPUPinnedAllocator::Free(Allocation *allocation) {
-  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation *>(allocation));
+void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
   PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
   delete allocation;
 }
 Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
                                              Allocator::Attr attr) {
-  // PADDLE_ENFORCE_EQ(
-  //    attr, kCrossDevice,
-  //    "CPUPinnedAllocator should be used for Cross-Device Communication");
-
   void *ptr;
   PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
-  return new CPUPinnedAllocation(ptr, size);
+  return new Allocation(ptr, size, platform::CUDAPinnedPlace());
 }
 }  // namespace allocation
 }  // namespace memory
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h
index 42d0938f2..deeb55a8f 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
@@ -20,18 +20,12 @@ namespace memory {
 namespace allocation {
 
 // Allocator uses `cudaHostAlloc`
-class CPUPinnedAllocation : public Allocation {
- public:
-  CPUPinnedAllocation(void *ptr, size_t size)
-      : Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
-};
-
 class CPUPinnedAllocator : public Allocator {
  public:
   bool IsAllocThreadSafe() const override;
 
  protected:
-  void Free(Allocation *allocation) override;
+  void FreeImpl(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
 };
 
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index 981705051..7e888988f 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -18,25 +18,15 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-bool RetryAllocator::IsAllocThreadSafe() const {
-  return underlying_allocator_->IsAllocThreadSafe();
-}
-
-void RetryAllocator::Free(Allocation* allocation) {
+void RetryAllocator::FreeImpl(Allocation* allocation) {
   // Delete underlying allocation first.
-  reinterpret_cast<AllocationWithUnderlying*>(allocation)->allocation_.reset();
-  {
-    // notify all waited allocators, they can try to allocate memory after free.
-    std::lock_guard<std::mutex> lock(mutex_);
-    cv_.notify_all();
-  }
-  delete allocation;
+  underlying_allocator_->Free(allocation);
+  cv_.notify_all();
 }
 
 Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   auto alloc_func = [&, this]() {
-    return new AllocationWithUnderlying(
-        underlying_allocator_->Allocate(size, attr));
+    return underlying_allocator_->Allocate(size, attr).release();
   };
   // In fact, we can unify the code of allocation success and failure
   // But it would add lock even when allocation success at the first time
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 6ab8ca8fb..379f576d6 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -25,32 +25,25 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-class RetryAllocator;
-
 class RetryAllocator : public Allocator {
  public:
-  RetryAllocator(std::unique_ptr<Allocator>&& allocator, size_t retry_ms)
+  RetryAllocator(std::shared_ptr<Allocator> allocator, size_t retry_ms)
       : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) {
-    EnforceCheck();
-  }
-
-  bool IsAllocThreadSafe() const override;
-
- private:
-  void EnforceCheck() {
     PADDLE_ENFORCE_NOT_NULL(
-        underlying_allocator_.get(),
-        "UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator");
+        underlying_allocator_,
+        "UnderlyingAllocator of RetryAllocator must not be null");
     PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(),
                    "UnderlyingAllocator of RetryAllocator must be thread-safe");
   }
 
+  bool IsAllocThreadSafe() const override { return true; }
+
  protected:
-  void Free(Allocation* allocation) override;
+  void FreeImpl(Allocation* allocation) override;
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
 
  private:
-  std::unique_ptr<Allocator> underlying_allocator_;
+  std::shared_ptr<Allocator> underlying_allocator_;
   std::chrono::milliseconds retry_time_;
   std::mutex mutex_;
   std::condition_variable cv_;
@@ -58,8 +51,6 @@ class RetryAllocator : public Allocator {
   // For debug, We can add an atomic integer to record how many memory sizes are
   // waited to allocate
   // std::atomic<size_t> waited_allocate_size_{0};
-
-  friend class RetryAllocation;
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc
index cb2df1a02..39743bcb1 100644
--- a/paddle/fluid/memory/allocation/zero_size_allocator.cc
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.cc
@@ -24,11 +24,20 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const {
 
 Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   if (size == 0) {
-    return new ZeroSizeAllocation(place_);
+    return new Allocation(nullptr, 0, place_);
   } else {
     return underlying_allocator_->Allocate(size, attr).release();
   }
 }
+
+void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
+  if (allocation->size() != 0) {
+    underlying_allocator_->Free(allocation);  // sized requests were forwarded
+    return;
+  }
+  delete allocation;  // zero-size placeholder built by AllocateImpl
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h
index 0f01dfcdf..08a7a06db 100644
--- a/paddle/fluid/memory/allocation/zero_size_allocator.h
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.h
@@ -24,12 +24,6 @@ namespace allocation {
 // The allocator handles the request's size is zero. Allocator will always
 // return an allocation even the request size is zero. However, the
 // allocation.ptr() is nullptr
-class ZeroSizeAllocation : public Allocation {
- public:
-  explicit ZeroSizeAllocation(const platform::Place& p)
-      : Allocation(nullptr, 0, p) {}
-};
-
 class ZeroSizeAllocator : public Allocator {
  public:
   ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
@@ -40,6 +34,7 @@ class ZeroSizeAllocator : public Allocator {
 
  protected:
   Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
+  void FreeImpl(Allocation* allocation) override;
 
  private:
   std::shared_ptr<Allocator> underlying_allocator_;
diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc
index 03b581484..fe2f528be 100644
--- a/paddle/fluid/platform/temporary_allocator.cc
+++ b/paddle/fluid/platform/temporary_allocator.cc
@@ -14,7 +14,6 @@
 
 #include "paddle/fluid/platform/temporary_allocator.h"
 #include <memory>
-#include <utility>
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 
 DEFINE_int64(limit_of_tmp_allocation, -1,
@@ -31,38 +30,31 @@ namespace paddle {
 namespace platform {
 namespace alloc = memory::allocation;
 
-TemporaryAllocation::TemporaryAllocation(
-    alloc::AllocationPtr &&underlying_allocation)
-    : Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
-                 underlying_allocation->place()),
-      underlying_allocation_(std::move(underlying_allocation)) {}
-
 TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) {
-  temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
+  temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>());
 }
 
 bool TemporaryAllocator::IsAllocThreadSafe() const { return true; }
 
 void TemporaryAllocator::Release(const std::function<void()> &callback) {
-  std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> t_allocations;
+  std::unique_ptr<std::multimap<size_t, alloc::Allocation *>> t_allocations;
   {
     std::unique_lock<std::mutex> lock(mtx_);
     callback();
     t_allocations.swap(temp_mem_map_);
-    temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
+    temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>());
     wait_delete_mem_ = 0;
   }
 
+  alloc::AllocationDeleter deleter;
   for (auto tmp : *t_allocations) {
     VLOG(10) << "Delete temporary allocation " << tmp.second->ptr()
              << " size: " << tmp.second->size();
-    delete tmp.second;
+    deleter(tmp.second);
   }
 }
 
-void TemporaryAllocator::Free(alloc::Allocation *allocation) {
-  auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation);
-  PADDLE_ENFORCE_NOT_NULL(temp_allocation);
+void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
   if (platform::is_gpu_place(temp_allocation->place())) {
     PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_),
                    "The place should be the same.");
@@ -86,7 +78,7 @@ void TemporaryAllocator::Free(alloc::Allocation *allocation) {
   }
   VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
            << " size: " << temp_allocation->size();
-  delete temp_allocation;
+  alloc::AllocationDeleter()(temp_allocation);
 }
 
 size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
@@ -121,11 +113,9 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl(
   }
   // If not find the the available allocation, get allocation from
   // AllocatorFacadeInstance.
-  auto raw_allocation =
-      alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
-  auto temp_mem = new TemporaryAllocation(std::move(raw_allocation));
+  auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
   VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size;
-  return temp_mem;
+  return temp_mem.release();
 }
 
 }  // namespace platform
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h
index f8a43b889..912d45eaf 100644
--- a/paddle/fluid/platform/temporary_allocator.h
+++ b/paddle/fluid/platform/temporary_allocator.h
@@ -23,14 +23,6 @@
 namespace paddle {
 namespace platform {
 
-class TemporaryAllocation : public memory::allocation::Allocation {
- public:
-  explicit TemporaryAllocation(
-      memory::allocation::AllocationPtr &&underlying_allocation);
-
-  memory::allocation::AllocationPtr underlying_allocation_;
-};
-
 /*! \brief the TemporaryAllocator is used to alloc the temporary allocation
  * which used by CUDA's async operation.
  *
@@ -57,7 +49,7 @@ class TemporaryAllocator : public memory::allocation::Allocator {
   void SetCallback(const std::function<void()> &callback);
 
  protected:
-  void Free(memory::allocation::Allocation *allocation) override;
+  void FreeImpl(memory::allocation::Allocation *allocation) override;
 
   memory::allocation::Allocation *AllocateImpl(
       size_t size, memory::allocation::Allocator::Attr attr) override;
@@ -66,8 +58,8 @@ class TemporaryAllocator : public memory::allocation::Allocator {
   platform::Place place_;
   // When the allocation is not held by any variable, it should be placed
   // to temp_mem_map immediately.
-  std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{
-      nullptr};
+  std::unique_ptr<std::multimap<size_t, memory::allocation::Allocation *>>
+      temp_mem_map_{nullptr};
   std::mutex mtx_;
   size_t wait_delete_mem_{0};
   std::function<void()> callback_;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index f3f5f7cc7..e0eaefad6 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -357,6 +357,7 @@ PYBIND11_MODULE(core, m) {
            [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
              self.mutable_data<float>(place);
            })
+      .def("_clear", &Tensor::clear)
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h
index 16bb3771f..66b768665 100644
--- a/paddle/fluid/string/printf.h
+++ b/paddle/fluid/string/printf.h
@@ -105,14 +105,12 @@ void Printf(const char* fmt, const Args&... args) {
   Fprintf(std::cout, fmt, args...);
 }
 
-template <typename T>
-std::string HumanReadableSize(T size) {
+inline std::string HumanReadableSize(double f_size) {
   size_t i = 0;
-  double f_size = static_cast<double>(size);
   double orig = f_size;
   const std::vector<std::string> units(
       {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
-  while (f_size > 1024) {
+  while (f_size >= 1024) {
     f_size /= 1024;
     i++;
   }
-- 
GitLab