From 58ed412f68049096421db2fa2c87b045877b81a5 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Fri, 28 Sep 2018 11:16:30 +0800
Subject: [PATCH] refactor(memory): rewrite memory allocation and make it
 extentable

Use OO style to rewrite memory allocation.
---
 .../framework/details/exception_holder.h      |   2 +
 paddle/fluid/framework/executor.cc            |  12 --
 paddle/fluid/framework/lod_tensor.h           |   3 -
 paddle/fluid/framework/mixed_vector.h         |  89 ++------
 paddle/fluid/framework/tensor.cc              |  27 +--
 paddle/fluid/framework/tensor.h               |  59 +-----
 paddle/fluid/framework/tensor_impl.h          |  12 +-
 paddle/fluid/memory/CMakeLists.txt            |   7 +-
 paddle/fluid/memory/allocation/CMakeLists.txt |  43 ++++
 .../memory/allocation/aligned_allocator.cc    |  26 +++
 .../memory/allocation/aligned_allocator.h     |  68 ++++++
 paddle/fluid/memory/allocation/allocator.cc   |  29 +++
 paddle/fluid/memory/allocation/allocator.h    |  93 ++++++++
 .../memory/allocation/allocator_facade.cc     | 102 +++++++++
 .../memory/allocation/allocator_facade.h      |  47 +++++
 .../memory/allocation/best_fit_allocator.cc   | 169 +++++++++++++++
 .../memory/allocation/best_fit_allocator.h    | 132 ++++++++++++
 .../allocation/best_fit_allocator_test.cc     | 144 +++++++++++++
 .../allocation/best_fit_allocator_test.cu     |  88 ++++++++
 .../fluid/memory/allocation/cpu_allocator.cc  |  40 ++++
 .../fluid/memory/allocation/cpu_allocator.h   |  38 ++++
 .../fluid/memory/allocation/cuda_allocator.cc |  69 ++++++
 .../fluid/memory/allocation/cuda_allocator.h  |  45 ++++
 .../memory/allocation/locked_allocator.cc     |  49 +++++
 .../memory/allocation/locked_allocator.h      |  38 ++++
 .../allocation/naive_managed_allocator.cc     |  69 ++++++
 .../allocation/naive_managed_allocator.h      |  71 +++++++
 .../naive_managed_allocator_test.cc           |  80 +++++++
 paddle/fluid/memory/malloc.cc                 | 178 +---------------
 paddle/fluid/memory/malloc.h                  |  90 +-------
 paddle/fluid/memory/malloc_test.cc            | 198 ------------------
 .../detection/generate_proposals_op.cu        |  24 +--
 paddle/fluid/operators/strided_memcpy_test.cc |  20 +-
 paddle/fluid/platform/device_context.cc       |  40 ++--
 paddle/fluid/platform/transform_test.cu       |   9 +-
 paddle/fluid/platform/variant.h               |   1 +
 paddle/testing/paddle_gtest_main.cc           |   9 +-
 python/paddle/fluid/__init__.py               |   8 +-
 38 files changed, 1552 insertions(+), 676 deletions(-)
 create mode 100644 paddle/fluid/memory/allocation/CMakeLists.txt
 create mode 100644 paddle/fluid/memory/allocation/aligned_allocator.cc
 create mode 100644 paddle/fluid/memory/allocation/aligned_allocator.h
 create mode 100644 paddle/fluid/memory/allocation/allocator.cc
 create mode 100644 paddle/fluid/memory/allocation/allocator.h
 create mode 100644 paddle/fluid/memory/allocation/allocator_facade.cc
 create mode 100644 paddle/fluid/memory/allocation/allocator_facade.h
 create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator.cc
 create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator.h
 create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator_test.cc
 create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator_test.cu
 create mode 100644 paddle/fluid/memory/allocation/cpu_allocator.cc
 create mode 100644 paddle/fluid/memory/allocation/cpu_allocator.h
 create mode 100644 paddle/fluid/memory/allocation/cuda_allocator.cc
 create mode 100644 paddle/fluid/memory/allocation/cuda_allocator.h
 create mode 100644 paddle/fluid/memory/allocation/locked_allocator.cc
 create mode 100644 paddle/fluid/memory/allocation/locked_allocator.h
 create mode 100644 paddle/fluid/memory/allocation/naive_managed_allocator.cc
 create mode 100644 paddle/fluid/memory/allocation/naive_managed_allocator.h
 create mode 100644 paddle/fluid/memory/allocation/naive_managed_allocator_test.cc
 delete mode 100644 paddle/fluid/memory/malloc_test.cc

diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h
index c97b364de1e..1b1afce04eb 100644
--- a/paddle/fluid/framework/details/exception_holder.h
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -30,6 +30,8 @@ class ExceptionHolder {
       Catch(exp);
     } catch (platform::EnforceNotMet exp) {
       Catch(exp);
+    } catch (std::exception& ex) {
+      LOG(FATAL) << "std::exception caught, " << ex.what();
     } catch (...) {
       LOG(FATAL) << "Unknown exception caught";
     }
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 8d8042a0563..59389f5c074 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -395,11 +395,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
         if (!erase_tensors.empty()) gc->Add(erase_tensors);
       }
     }
-
-    if (FLAGS_benchmark) {
-      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
-              << memory::memory_usage(place_);
-    }
   }
 
   if (gc != nullptr) {
@@ -421,13 +416,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
       scope->DropKids();
     }
   }
-
-  if (FLAGS_benchmark) {
-    VLOG(2) << "-------------------------------------------------------";
-    VLOG(2) << "Memory used after deleting local scope: "
-            << memory::memory_usage(place_);
-    VLOG(2) << "-------------------------------------------------------";
-  }
 }
 
 void Executor::RunPreparedContext(
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index e9b473d5472..fb6e781fd07 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -111,9 +111,6 @@ class LoDTensor : public Tensor {
  public:
   LoDTensor() : Tensor() {}
 
-  /* Constructor with place should only be used in pybind */
-  explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
-
   explicit LoDTensor(const LoD& lod) : lod_(lod) {}
 
   void set_lod(const LoD& lod) { lod_ = lod; }
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 77386f4f069..cbaa80dffac 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
 
 #include "glog/logging.h"
@@ -31,46 +32,6 @@ namespace paddle {
 namespace framework {
 
 #if defined(PADDLE_WITH_CUDA)
-namespace details {
-struct CUDABuffer {
-  void *data_{nullptr};
-  size_t size_{0};
-  platform::CUDAPlace place_;
-
-  CUDABuffer() {}
-  CUDABuffer(platform::Place place, size_t size)
-      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
-    data_ = memory::Alloc(place_, size);
-  }
-
-  ~CUDABuffer() { ClearMemory(); }
-
-  CUDABuffer(const CUDABuffer &o) = delete;
-  CUDABuffer &operator=(const CUDABuffer &o) = delete;
-
-  void Resize(platform::Place place, size_t size) {
-    ClearMemory();
-    place_ = boost::get<platform::CUDAPlace>(place);
-    data_ = memory::Alloc(place_, size);
-    PADDLE_ENFORCE_NOT_NULL(data_);
-    size_ = size;
-  }
-
-  void Swap(CUDABuffer &o) {
-    std::swap(data_, o.data_);
-    std::swap(place_, o.place_);
-    std::swap(size_, o.size_);
-  }
-
- private:
-  void ClearMemory() const {
-    if (data_ != nullptr) {
-      memory::Free(place_, data_);
-    }
-  }
-};
-}  // namespace details
-
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -103,8 +64,6 @@ class Vector {
       o.ImmutableCPU();
       cpu_ = o.cpu_;
       flag_ = kDataInCPU;
-      details::CUDABuffer null;
-      gpu_.Swap(null);
       return *this;
     }
 
@@ -199,7 +158,7 @@ class Vector {
       PADDLE_ENFORCE(platform::is_gpu_place(place),
                      "CUDA Data must on CUDA place");
       ImmutableCUDA(place);
-      return reinterpret_cast<T *>(gpu_.data_);
+      return reinterpret_cast<T *>(gpu_->ptr());
     }
 
     // get cuda ptr. mutable
@@ -234,13 +193,11 @@ class Vector {
 
     std::mutex &Mutex() const { return mtx_; }
 
-    std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
-      if (gpu_.data_ == nullptr) {
-        return nullptr;
-      } else {
-        return std::unique_ptr<platform::CUDAPlace>(
-            new platform::CUDAPlace(gpu_.place_));
-      }
+    boost::optional<platform::CUDAPlace> CUDAPlace() const {
+      return gpu_ == nullptr
+                 ? boost::none
+                 : boost::optional<platform::CUDAPlace>(
+                       boost::get<platform::CUDAPlace>(gpu_->place()));
     }
 
    private:
@@ -254,13 +211,12 @@ class Vector {
     void CopyToCPU() const {
       // COPY GPU Data To CPU
       auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(
-              platform::Place(gpu_.place_)));
+          platform::DeviceContextPool::Instance().Get(gpu_->place()));
       auto stream = dev_ctx->stream();
-      void *src = gpu_.data_;
+      void *src = gpu_->ptr();
       void *dst = cpu_.data();
-      memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
-                   stream);
+      memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
+                   gpu_->size(), stream);
       dev_ctx->Wait();
     }
 
@@ -277,8 +233,7 @@ class Vector {
           CopyCPUDataToCUDA(place);
           UnsetFlag(kDirty);
           SetFlag(kDataInCUDA);
-        } else if (IsInCUDA() &&
-                   !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+        } else if (IsInCUDA() && !(place == gpu_->place())) {
           PADDLE_THROW("This situation should not happen");
           // Still dirty
         } else {
@@ -290,7 +245,7 @@ class Vector {
           // Even data is not dirty. However, data is not in CUDA. Copy data.
           CopyCPUDataToCUDA(place);
           SetFlag(kDataInCUDA);
-        } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+        } else if (!(place == gpu_->place())) {
           PADDLE_THROW("This situation should not happen.");
         } else {
           // Not Dirty && DataInCUDA && Device is same
@@ -301,13 +256,13 @@ class Vector {
 
     void CopyCPUDataToCUDA(const platform::Place &place) const {
       void *src = cpu_.data();
-      gpu_.Resize(place, cpu_.size() * sizeof(T));
-      void *dst = gpu_.data_;
+      gpu_ = memory::Alloc(place, cpu_.size() * sizeof(T));
+      void *dst = gpu_->ptr();
       auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
           platform::DeviceContextPool::Instance().Get(place));
       auto stream = dev_ctx->stream();
-      memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
-                   stream);
+      memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
+                   gpu_->size(), stream);
     }
 
     void ImmutableCPU() const {
@@ -329,7 +284,7 @@ class Vector {
     bool IsInCPU() const { return flag_ & kDataInCPU; }
 
     mutable std::vector<T> cpu_;
-    mutable details::CUDABuffer gpu_;
+    mutable std::unique_ptr<memory::Allocation> gpu_;
     mutable int flag_;
 
     mutable std::mutex mtx_;
@@ -428,8 +383,8 @@ class Vector {
       auto &mtx = m_.Data().Mutex();
       std::lock_guard<std::mutex> guard(mtx);
       auto cuda_place = m_.Data().CUDAPlace();
-      if (cuda_place == nullptr ||
-          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+      if (cuda_place == boost::none ||
+          cuda_place == boost::get<platform::CUDAPlace>(place)) {
         return m_.Data().CUDAData(place);
       }
     }
@@ -444,8 +399,8 @@ class Vector {
       auto &mtx = m_.Data().Mutex();
       std::lock_guard<std::mutex> guard(mtx);
       auto cuda_place = m_.Data().CUDAPlace();
-      if (cuda_place == nullptr ||
-          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+      if (cuda_place == boost::none ||
+          cuda_place == boost::get<platform::CUDAPlace>(place)) {
         return m_.MutableData()->CUDAMutableData(place);
       }
     }
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index b6ba0df033a..48d300eba95 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -33,9 +33,7 @@ size_t Tensor::memory_size() const {
 
 void* Tensor::mutable_data(platform::Place place, std::type_index type,
                            size_t requested_size) {
-  if (holder_ != nullptr) {
-    holder_->set_type(type);
-  }
+  type_ = type;
   PADDLE_ENFORCE_GE(numel(), 0,
                     "When calling this method, the Tensor's numel must be "
                     "equal or larger than zero. "
@@ -48,25 +46,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
   /* some versions of boost::variant don't have operator!= */
   if (holder_ == nullptr || !(holder_->place() == place) ||
       holder_->size() < size + offset_) {
-    if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
-    } else if (platform::is_gpu_place(place) ||
-               platform::is_cuda_pinned_place(place)) {
-#ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW(
-          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
-    }
-#else
-      if (platform::is_gpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-            boost::get<platform::CUDAPlace>(place), size, type));
-      } else if (platform::is_cuda_pinned_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
-            boost::get<platform::CUDAPinnedPlace>(place), size, type));
-      }
-    }
-#endif
+    holder_ = memory::AllocShared(place, size);
     offset_ = 0;
   }
   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
@@ -76,7 +56,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
 void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
   PADDLE_ENFORCE(this->holder_ != nullptr,
                  "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type(), requested_size);
+  return mutable_data(place, type_, requested_size);
 }
 
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -101,6 +81,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const {
     Tensor dst;
     dst.holder_ = holder_;
     dst.set_layout(layout_);
+    dst.type_ = type_;
     DDim dst_dims = dims_;
     dst_dims[0] = end_idx - begin_idx;
     dst.Resize(dst_dims);
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index f1d26854857..232b5a67a0a 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -67,12 +67,7 @@ class Tensor {
   friend struct EigenVector;
 
  public:
-  Tensor() : offset_(0) {}
-
-  /*! Constructor with place should only be used in pybind. */
-  explicit Tensor(const platform::Place& place) : offset_(0) {
-    holder_->set_place(place);
-  }
+  Tensor() : type_(typeid(float)), offset_(0) {}
 
   /*! Return a pointer to mutable memory block. */
   template <typename T>
@@ -139,7 +134,7 @@ class Tensor {
   std::type_index type() const {
     PADDLE_ENFORCE_NOT_NULL(
         holder_, "Tensor not initialized yet when Tensor::type() is called.");
-    return holder_->type();
+    return type_;
   }
 
   // memory size returns the holding memory size in byte.
@@ -154,55 +149,9 @@ class Tensor {
   void clear() { holder_ = nullptr; }
 
  private:
-  /**
-   * @note    Placeholder hides type T, so it doesn't appear as a template
-   *          parameter of Variable.
-   */
-  struct Placeholder {
-    virtual ~Placeholder() = default;
-    virtual void* ptr() const = 0;
-    virtual size_t size() const = 0;
-    virtual std::type_index type() const = 0;
-    virtual platform::Place place() const = 0;
-    virtual void set_type(std::type_index type) = 0;
-    virtual void set_place(platform::Place place) = 0;
-  };
-
-  template <typename Place>
-  struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size, std::type_index type)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
-               memory::PODDeleter<uint8_t, Place>(place)),
-          place_(place),
-          size_(size),
-          type_(type) {
-      PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
-                              (is_cpu_place(place_) ? "CPU" : "GPU"));
-    }
-
-    virtual size_t size() const { return size_; }
-    virtual platform::Place place() const { return place_; }
-    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
-    virtual std::type_index type() const { return type_; }
-    virtual void set_type(std::type_index type) { type_ = type; }
-    virtual void set_place(platform::Place place) { place_ = place; }
-
-    /*! the pointer of memory block. */
-    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
-
-    /*! the place of memory block. */
-    platform::Place place_;
-
-    /*! the size of memory block. */
-    size_t size_;
-
-    /* the current type of memory */
-    std::type_index type_;
-  };
-
   /*! holds the memory block if allocated. */
-  std::shared_ptr<Placeholder> holder_;
-
+  std::shared_ptr<memory::Allocation> holder_;
+  std::type_index type_;
   /**
    * @brief points to elements dimensions.
    *
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 6d3047c95d6..dfa251c02da 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -23,10 +23,10 @@ namespace framework {
 template <typename T>
 inline const T* Tensor::data() const {
   check_memory_size();
-  bool valid = std::is_same<T, void>::value ||
-               holder_->type() == std::type_index(typeid(T));
+  bool valid =
+      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
   PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 this->holder_->type().name());
+                 type_.name());
 
   return reinterpret_cast<const T*>(
       reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
@@ -37,10 +37,10 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
 template <typename T>
 inline T* Tensor::data() {
   check_memory_size();
-  bool valid = std::is_same<T, void>::value ||
-               holder_->type() == std::type_index(typeid(T));
+  bool valid =
+      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
   PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 this->holder_->type().name());
+                 type_.name());
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
 }
diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt
index 709fc7e12e1..bdf8325d150 100644
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@@ -1,15 +1,12 @@
 add_subdirectory(detail)
-
-cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
+add_subdirectory(allocation)
+cc_library(malloc SRCS malloc.cc DEPS allocator_facade)
 cc_library(memcpy SRCS memcpy.cc DEPS place)
 
 cc_library(memory
         DEPS
         malloc
         memcpy)
-
-cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
-
 #if (WITH_GPU)
 #   nv_test(pinned_memory_test SRCS pinned_memory_test.cu  DEPS place memory)
 #endif()
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
new file mode 100644
index 00000000000..a932b164401
--- /dev/null
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -0,0 +1,43 @@
+cc_library(allocator SRCS allocator.cc DEPS place)
+cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
+cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
+cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
+nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator gpu_info)
+
+if (WITH_GPU)
+    nv_test(best_fit_allocator_test
+            SRCS best_fit_allocator_test.cc
+                 best_fit_allocator_test.cu
+            DEPS best_fit_allocator
+                 locked_allocator
+                 cpu_allocator
+                 cuda_allocator
+                 device_context
+                 memcpy)
+else()
+    cc_test(best_fit_allocator_test
+            SRCS best_fit_allocator_test.cc
+            DEPS best_fit_allocator
+                 locked_allocator
+                 cpu_allocator)
+endif()
+
+
+cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocator)
+cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
+
+if (WITH_GPU)
+    set(AllocatorFacadeDeps gpu_info cuda_allocator)
+else ()
+    set(AllocatorFacadeDeps)
+endif()
+
+cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
+
+cc_library(allocator_facade SRCS allocator_facade.cc DEPS
+        ${AllocatorFacadeDeps}
+        cpu_allocator
+        locked_allocator
+        best_fit_allocator
+        naive_managed_allocator
+        aligned_allocator)
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc
new file mode 100644
index 00000000000..a805e19bc9f
--- /dev/null
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+ThinAlignedAllocator::ThinAlignedAllocator(
+    std::shared_ptr<ManagedAllocator> underlyning_allocator)
+    : underlying_allocator_(std::move(underlyning_allocator)) {}
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
new file mode 100644
index 00000000000..d9eb7870c9b
--- /dev/null
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+template <size_t kAlignment>
+class AlignedAllocation : public Allocation {
+ public:
+  AlignedAllocation(std::unique_ptr<Allocation>&& underlying_allocation,
+                    size_t size)
+      : Allocation(AlignedPtr(underlying_allocation->ptr()), size,
+                   underlying_allocation->place()),
+        underlying_allocation_(std::move(underlying_allocation)) {}
+
+ private:
+  static void* AlignedPtr(void* ptr) {
+    auto ptr_addr = reinterpret_cast<uintptr_t>(ptr);
+    ptr_addr = (ptr_addr & ~(kAlignment - 1)) + kAlignment;
+    return reinterpret_cast<void*>(ptr_addr);
+  }
+
+  std::unique_ptr<Allocation> underlying_allocation_;
+};
+
+class ThinAlignedAllocator : public ManagedAllocator {
+ public:
+  explicit ThinAlignedAllocator(
+      std::shared_ptr<ManagedAllocator> underlyning_allocator);
+
+ protected:
+  std::shared_ptr<ManagedAllocator> underlying_allocator_;
+};
+
+template <size_t kAlignment>
+class AlignedAllocator : public ThinAlignedAllocator {
+ public:
+  using ThinAlignedAllocator::ThinAlignedAllocator;
+  std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
+    auto raw_allocation =
+        underlying_allocator_->Allocate(size + kAlignment, attr);
+    return std::unique_ptr<Allocation>(
+        new AlignedAllocation<kAlignment>(std::move(raw_allocation), size));
+  }
+  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
+    return std::shared_ptr<Allocation>(Allocate(size, attr).release());
+  }
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc
new file mode 100644
index 00000000000..8833b4e1cd6
--- /dev/null
+++ b/paddle/fluid/memory/allocation/allocator.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+namespace paddle {
+namespace memory {
+namespace allocation {
+Allocation::~Allocation() {}
+
+Allocator::~Allocator() {}
+
+bool Allocator::IsAllocThreadSafe() const { return false; }
+
+const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
new file mode 100644
index 00000000000..500fc28645b
--- /dev/null
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class BadAlloc : public std::exception {
+ public:
+  explicit BadAlloc(const std::string& msg) : msg_(msg) {}
+  const char* what() const noexcept override;
+
+ private:
+  std::string msg_;
+};
+
+class Allocation {
+ public:
+  Allocation(void* ptr, size_t size, platform::Place place)
+      : ptr_(ptr), size_(size), place_(place) {}
+
+  Allocation(const Allocation& o) = delete;
+  Allocation& operator=(const Allocation& o) = delete;
+
+  void* ptr() const { return ptr_; }
+
+  size_t size() const { return size_; }
+
+  const platform::Place& place() const { return place_; }
+
+  virtual ~Allocation();
+
+ private:
+  void* ptr_;
+  size_t size_;
+  platform::Place place_;
+};
+
+class Allocator {
+ public:
+  enum Attr {
+    kDefault = 0,
+    kTiny = 1,
+    kFixedHuge = 2,
+    kFluxHuge = 3,
+    kTmp = 4,
+    NumOfAttrs = 5
+  };
+
+  virtual ~Allocator();
+  virtual std::unique_ptr<Allocation> Allocate(
+      size_t size, Allocator::Attr attr = kDefault) = 0;
+
+  virtual bool IsAllocThreadSafe() const;
+};
+
+// User need to invoke `Free` or `FreeUniquePtr` manually if allocated by
+// a manally managed allocator.
+class UnmanagedAllocator : public Allocator {
+ public:
+  virtual void Free(Allocation* allocation) = 0;
+
+  void FreeUniquePtr(std::unique_ptr<Allocation> allocation) {
+    Free(allocation.get());
+  }
+};
+
+// The allocation will be managed by smart pointers
+class ManagedAllocator : public Allocator {
+ public:
+  virtual std::shared_ptr<Allocation> AllocateShared(
+      size_t size, Allocator::Attr attr = kDefault) = 0;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
new file mode 100644
index 00000000000..fc508e75f1c
--- /dev/null
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include <map>
+#include <vector>
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/cpu_allocator.h"
+#include "paddle/fluid/memory/allocation/locked_allocator.h"
+#include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#endif
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class AllocatorFacadePrivate {
+ public:
+  std::map<platform::Place, std::shared_ptr<ManagedAllocator>> allocators_;
+  std::vector<std::unique_ptr<Allocation>> pre_allocations_;
+  std::vector<std::shared_ptr<Allocator>> holding_allocators_;
+
+  ~AllocatorFacadePrivate() {
+    // Specify destruct order.
+    pre_allocations_.clear();
+    allocators_.clear();
+    holding_allocators_.clear();
+  }
+
+  AllocatorFacadePrivate() {
+    InitCPUAllocator();
+    InitCUDAAllocator();
+  }
+
+ private:
+  void InitCPUAllocator() {
+    auto all = NaiveManagedAllocator::Create(
+        std::unique_ptr<Allocator>(new CPUAllocator()));
+
+    allocators_[platform::CPUPlace()] = all;
+  }
+
+  void InitCUDAAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
+      auto cuda_allocator =
+          NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
+              new CUDAAllocator(platform::CUDAPlace(dev_id))));
+
+      auto allocation = cuda_allocator->Allocate(platform::GpuMaxChunkSize());
+      auto allocator = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
+          new LockedAllocator(std::unique_ptr<Allocator>(
+              new BestFitAllocator(allocation.get())))));
+
+      pre_allocations_.emplace_back(std::move(allocation));
+      holding_allocators_.emplace_back(cuda_allocator);
+      allocators_[platform::CUDAPlace(dev_id)] =
+          std::make_shared<AlignedAllocator<64>>(std::move(allocator));
+    }
+#endif
+  }
+};
+
+AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
+AllocatorFacade::~AllocatorFacade() { delete m_; }
+
+AllocatorFacade& AllocatorFacade::Instance() {
+  static AllocatorFacade instance;
+  return instance;
+}
+
+std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
+    const platform::Place& place, size_t size, Allocator::Attr attr) {
+  return m_->allocators_[place]->AllocateShared(size, attr);
+}
+
+std::unique_ptr<Allocation> AllocatorFacade::Alloc(const platform::Place& place,
+                                                   size_t size,
+                                                   Allocator::Attr attr) {
+  return m_->allocators_[place]->Allocate(size, attr);
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
new file mode 100644
index 00000000000..d780fb6e64b
--- /dev/null
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class AllocatorFacadePrivate;
+class AllocatorFacade {
+ public:
+  ~AllocatorFacade();
+  AllocatorFacade(const AllocatorFacade& o) = delete;
+  const AllocatorFacade& operator=(const AllocatorFacade& o) = delete;
+
+  static AllocatorFacade& Instance();
+
+  std::shared_ptr<Allocation> AllocShared(
+      const platform::Place& place, size_t size,
+      Allocator::Attr attr = Allocator::kDefault);
+
+  std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
+                                    Allocator::Attr attr = Allocator::kDefault);
+
+ private:
+  AllocatorFacade();
+  AllocatorFacadePrivate* m_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
new file mode 100644
index 00000000000..aa338f46756
--- /dev/null
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
+#include <bits/stdc++.h>
+#include <list>
+#include <map>
+#include <string>
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+static int HighestBitPos(size_t N) {
+  if (UNLIKELY(N == 0)) {
+    return 0;
+  } else {
+    // NOTE: here we can use __builtin_clz in GCC.
+    // However, let's use std::log2 for better readability
+    // and trust std::log2's performance.
+    return static_cast<int>(std::log2(N) + 1);
+  }
+}
+
+BestFitAllocator::BestFitAllocator(Allocation* allocation)
+    : allocation_(allocation) {
+  details::Chunk chunk;
+  chunk.size_ = allocation_->size();
+  chunk.offset_ = 0;
+  chunk.is_free = true;
+  chunks_.emplace_back(chunk);
+  free_chunks_[HighestBitPos(chunk.size_)].insert(
+      {chunk.size_, chunks_.begin()});
+}
+
+std::unique_ptr<Allocation> BestFitAllocator::Allocate(size_t size, Attr attr) {
+  auto highest_set_bit = static_cast<size_t>(HighestBitPos(size));
+  MapIt map_it;
+  for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) {
+    map_it = free_chunks_[highest_set_bit].lower_bound(size);
+    if (map_it != free_chunks_[highest_set_bit].end()) {
+      break;
+    }
+  }
+  if (UNLIKELY(highest_set_bit == free_chunks_.size())) {
+    throw BadAlloc(string::Sprintf(
+        "Cannot allocate %d, All fragments size is %d", size, FreeSize()));
+  }
+  auto chunk_it = SplitChunk(size, highest_set_bit, map_it);
+  return std::unique_ptr<Allocation>(new BestFitAllocation(this, chunk_it));
+}
+
+size_t BestFitAllocator::FreeSize() const {
+  size_t acc = 0;
+  for (auto& array_item : free_chunks_) {
+    for (auto& pair : array_item) {
+      acc += pair.second->size_;
+    }
+  }
+  return acc;
+}
+
+BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size,
+                                                      size_t free_chunk_offset,
+                                                      MapIt bin_iterator) {
+  auto to_split_it = bin_iterator->second;
+  free_chunks_[free_chunk_offset].erase(bin_iterator);
+
+  PADDLE_ENFORCE(to_split_it->is_free);
+  PADDLE_ENFORCE_GE(to_split_it->size_, request_size);
+
+  auto remaining_size = to_split_it->size_ - request_size;
+  details::Chunk to_use;
+  details::Chunk remaining;
+  to_use.size_ = request_size;
+  to_use.is_free = false;
+  remaining.size_ = remaining_size;
+  remaining.is_free = true;
+
+  // calc offsets
+  to_use.offset_ = to_split_it->offset_;
+  remaining.offset_ = to_use.offset_ + to_use.size_;
+
+  // insert to chunk list
+  auto to_use_it = chunks_.insert(to_split_it, to_use);
+  if (remaining.size_ != 0) {
+    auto bit_size = static_cast<size_t>(HighestBitPos(remaining.size_));
+    free_chunks_[bit_size].insert(
+        {remaining.size_, chunks_.insert(to_split_it, remaining)});
+  }
+  chunks_.erase(to_split_it);
+  return to_use_it;
+}
+
+void BestFitAllocator::Free(Allocation* allocation) {
+  auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
+  auto chunk_it = bf_allocation->ChunkIterator();
+  PADDLE_ENFORCE(!chunk_it->is_free);
+  chunk_it->is_free = true;
+  if (chunk_it != chunks_.begin()) {
+    auto prev_it = chunk_it;
+    --prev_it;
+
+    if (prev_it->is_free) {
+      // Merge Left.
+      EraseFreeNode(prev_it);
+      prev_it->size_ += chunk_it->size_;
+      chunks_.erase(chunk_it);
+      chunk_it = prev_it;
+    }
+  }
+
+  auto next_it = chunk_it;
+  ++next_it;
+  if (next_it != chunks_.end() && next_it->is_free) {
+    EraseFreeNode(next_it);
+    chunk_it->size_ += next_it->size_;
+    chunks_.erase(next_it);
+  }
+
+  InsertFreeNode(chunk_it);
+}
+
+void BestFitAllocator::InsertFreeNode(const ListIt& it) {
+  auto pos = static_cast<size_t>(HighestBitPos(it->size_));
+  auto& free_map = free_chunks_[pos];
+  free_map.insert({it->size_, it});
+}
+void BestFitAllocator::EraseFreeNode(const ListIt& it) {
+  size_t pos = static_cast<size_t>(HighestBitPos(it->size_));
+  auto& free_map = free_chunks_[pos];
+  auto map_it = free_map.find(it->size_);
+  while (map_it->second != it && map_it != free_map.end()) {
+    ++map_it;
+  }
+  PADDLE_ENFORCE(map_it != free_map.end());
+  free_map.erase(map_it);
+}
+size_t BestFitAllocator::NumFreeChunks() const {
+  size_t num = 0;
+  for (auto& array_item : free_chunks_) {
+    num += array_item.size();
+  }
+  return num;
+}
+
+BestFitAllocation::BestFitAllocation(
+    paddle::memory::allocation::BestFitAllocator* allocator,
+    typename details::ChunkList::iterator chunk_it)
+    : Allocation(reinterpret_cast<void*>(
+                     reinterpret_cast<uintptr_t>(allocator->BasePtr()) +
+                     chunk_it->offset_),
+                 chunk_it->size_, allocator->Place()),
+      allocator_(allocator),
+      chunk_it_(chunk_it) {}
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h
new file mode 100644
index 00000000000..309a2a77088
--- /dev/null
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
@@ -0,0 +1,132 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <array>
+#include <list>
+#include <map>
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+namespace details {
+struct Chunk {
+  bool is_free{true};
+  // Offset to the base allocation.
+  uintptr_t offset_;
+  size_t size_;
+};
+
+// Here we use std::list to maintain chunk list.
+// NOTE(yy): The traditional implementation of ChunkList is add `prev`/`next`
+// pointers in `Chunk`, and split the allocation as `ChunkHeader` and
+// `Payload`. Such as
+//   *-------*---------------*---------------*--------------*
+//   | Chunk | prev_ pointer | next_ pointer | payload .... |
+//   *-------*---------------*---------------*--------------*
+// This implementation can just return a raw pointer, and we can get the list
+// structure by it. However, we cannot use the same code on GPU since CPU
+// cannot access GPU memory directly.
+//
+// So we choose to use `std::list` and return an allocation instance, which
+// contains the list node iterator, then we can unify CPU/GPU code.
+//
+// To return an allocation is not a bad idea, since Tensor/Vector should holds
+// an allocation instead of raw pointer directly.
+using ChunkList = std::list<Chunk>;
+
+// Here we use a multi-level map of free chunks.
+// the map is
+//      MSB offset --> size --> [ChunkList::iterator]
+//
+// The time complexities:
+//     find a free chunk:
+//          O(logN),
+//               where N is the number of free nodes with the same MSB offset.
+//     find the position of a chunk iterator:
+//          O(logN + K),
+//               where N is the number of free nodes with the same MSB offset.
+//               where K is the number of free nodes with the same size.
+//     insert a free chunk:
+//          O(logN),
+//               where N is the number of free nodes with the same MSB offset.
+//     erase a free chunk:
+//          O(1)
+using FreeChunkBin =
+    std::array<std::multimap<size_t, ChunkList::iterator>, sizeof(size_t) * 8>;
+}  // namespace details
+
+class BestFitAllocator;
+
+// The BestFitAllocation maintain the List Node iterator.
+class BestFitAllocation : public Allocation {
+ private:
+  using ListIt = typename details::ChunkList::iterator;
+
+ public:
+  BestFitAllocation(BestFitAllocator* allocator, ListIt chunk_it);
+
+  const ListIt& ChunkIterator() const { return chunk_it_; }
+
+ private:
+  BestFitAllocator* allocator_;
+  typename details::ChunkList::iterator chunk_it_;
+};
+
+// TODO(yy): Current BestFitAllocator is not thread-safe. To make it thread
+// safe, we must wrap a locked_allocator. However, we can implement a thread
+// safe allocator by locking each bin and chunks list independently. It will
+// make BestFitAllocator faster in multi-thread situation.
+//
+// This allocator implements a best-fit allocator with merging the free nodes.
+//
+// To allocate a buffer, it will find the best-fit chunk. If the best-fit chunk
+// is larger than request size, the original block will be split into two
+// chunks. The first block will be used and the second block will be put into
+// free chunks.
+//
+// To free an allocation, it will set the chunk of allocation to free and merge
+// the prev-chunk and the next-chunk when possible.
+class BestFitAllocator : public UnmanagedAllocator {
+ public:
+  explicit BestFitAllocator(Allocation* allocation);
+
+  void* BasePtr() const { return allocation_->ptr(); }
+
+  const platform::Place& Place() const { return allocation_->place(); }
+
+  std::unique_ptr<Allocation> Allocate(size_t size,
+                                       Attr attr = kDefault) override;
+  void Free(Allocation* allocation) override;
+
+  size_t NumFreeChunks() const;
+
+ private:
+  size_t FreeSize() const;
+  using MapIt = typename details::FreeChunkBin::value_type::iterator;
+  using ListIt = typename details::ChunkList::iterator;
+
+  ListIt SplitChunk(size_t request_size, size_t free_chunk_offset,
+                    MapIt bin_iterator);
+  void EraseFreeNode(const ListIt& it);
+  void InsertFreeNode(const ListIt& it);
+
+  Allocation* allocation_;  // not owned
+  details::ChunkList chunks_;
+  details::FreeChunkBin free_chunks_;
+};
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
new file mode 100644
index 00000000000..9af903a128d
--- /dev/null
+++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
+#include <thread>  // NOLINT
+#include <vector>
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/cpu_allocator.h"
+#include "paddle/fluid/memory/allocation/locked_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class StubAllocation : public Allocation {
+ public:
+  explicit StubAllocation(size_t size)
+      : Allocation(0, size, platform::CPUPlace()) {}
+};
+
+TEST(BestFitAllocator, test_allocation) {
+  StubAllocation stub(4UL * 1024 * 1024 * 1024);
+  BestFitAllocator allocator(&stub);
+  {
+    auto allocation = allocator.Allocate(64);
+    allocator.FreeUniquePtr(std::move(allocation));
+  }
+
+  {
+    auto allocation = allocator.Allocate(80);
+
+    {
+      auto best_fit_allocation =
+          dynamic_cast<BestFitAllocation*>(allocation.get());
+      ASSERT_NE(best_fit_allocation, nullptr);
+      ASSERT_FALSE(best_fit_allocation->ChunkIterator()->is_free);
+      ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0);
+      ASSERT_EQ(allocation->size(), 80);
+      ASSERT_EQ(allocation->ptr(), nullptr);
+    }
+
+    auto allocation2 = allocator.Allocate(60);
+    auto allocation3 = allocator.Allocate(90);
+    allocator.FreeUniquePtr(std::move(allocation2));
+    allocation2 = allocator.Allocate(30);
+
+    {
+      auto best_fit_allocation =
+          dynamic_cast<BestFitAllocation*>(allocation2.get());
+      ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80);
+    }
+    allocator.FreeUniquePtr(std::move(allocation2));
+
+    allocation2 = allocator.Allocate(60);
+
+    {
+      auto best_fit_allocation =
+          dynamic_cast<BestFitAllocation*>(allocation2.get());
+      ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80);
+    }
+
+    allocator.FreeUniquePtr(std::move(allocation));
+    allocator.FreeUniquePtr(std::move(allocation2));
+
+    allocation = allocator.Allocate(80 + 60);
+    {
+      auto best_fit_allocation =
+          dynamic_cast<BestFitAllocation*>(allocation.get());
+      ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0);
+    }
+
+    allocator.FreeUniquePtr(std::move(allocation));
+
+    allocation = allocator.Allocate(80);
+    allocation2 = allocator.Allocate(60);
+    allocator.FreeUniquePtr(std::move(allocation));
+    allocator.FreeUniquePtr(std::move(allocation3));
+    allocator.FreeUniquePtr(std::move(allocation2));
+
+    ASSERT_EQ(allocator.NumFreeChunks(), 1U);
+  }
+}
+
+TEST(BestFitAllocator, test_concurrent_cpu_allocation) {
+  CPUAllocator allocator;
+  auto global_allocation = allocator.Allocate(256UL * 1024 * 1024);
+
+  std::unique_ptr<Allocator> best_fit_allocator(
+      new BestFitAllocator(global_allocation.get()));
+
+  LockedAllocator locked_allocator(std::move(best_fit_allocator));
+
+  auto th_main = [&] {
+    std::random_device dev;
+    std::default_random_engine engine(dev());
+    std::uniform_int_distribution<size_t> dist(1U, 1024U);
+
+    for (size_t i = 0; i < 128; ++i) {
+      size_t allocate_size = dist(engine);
+
+      auto allocation =
+          locked_allocator.Allocate(sizeof(size_t) * allocate_size);
+
+      size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
+
+      for (size_t j = 0; j < allocate_size; ++j) {
+        data[j] = j;
+      }
+      std::this_thread::yield();
+
+      for (size_t j = 0; j < allocate_size; ++j) {
+        ASSERT_EQ(data[j], j);
+      }
+
+      locked_allocator.FreeUniquePtr(std::move(allocation));
+    }
+  };
+  {
+    std::vector<std::thread> threads;
+    for (size_t i = 0; i < 1024; ++i) {
+      threads.emplace_back(th_main);
+    }
+    for (auto& th : threads) {
+      th.join();
+    }
+  }
+
+  allocator.FreeUniquePtr(std::move(global_allocation));
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
new file mode 100644
index 00000000000..a3dcb8b2aef
--- /dev/null
+++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
@@ -0,0 +1,88 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <thread>  // NOLINT
+#include <vector>
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#include "paddle/fluid/memory/allocation/locked_allocator.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/for_range.h"
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+struct ForEachFill {
+  size_t* ptr_;
+
+  explicit ForEachFill(size_t* ptr) : ptr_(ptr) {}
+
+  __device__ void operator()(size_t i) { ptr_[i] = i; }
+};
+
+TEST(BestFitAllocator, concurrent_cuda) {
+  CUDAAllocator allocator(platform::CUDAPlace(0));
+  // 256 MB
+  auto cuda_allocation = allocator.Allocate(256U * 1024 * 1024);
+  LockedAllocator concurrent_allocator(
+      std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get())));
+
+  auto th_main = [&] {
+    std::random_device dev;
+    std::default_random_engine engine(dev());
+    std::uniform_int_distribution<size_t> dist(1U, 1024U);
+    platform::CUDAPlace gpu(0);
+    platform::CUDADeviceContext dev_ctx(gpu);
+    std::array<size_t, 1024> buf;
+    for (size_t i = 0; i < 128; ++i) {
+      size_t allocate_size = dist(engine);
+
+      auto allocation =
+          concurrent_allocator.Allocate(sizeof(size_t) * allocate_size);
+
+      size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
+
+      ForEachFill fill(data);
+      platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
+                                                                allocate_size);
+      for_range(fill);
+
+      memory::Copy(platform::CPUPlace(), buf.data(), gpu, data,
+                   sizeof(size_t) * allocate_size, dev_ctx.stream());
+
+      dev_ctx.Wait();
+      for (size_t j = 0; j < allocate_size; ++j) {
+        ASSERT_EQ(buf[j], j);
+      }
+
+      concurrent_allocator.FreeUniquePtr(std::move(allocation));
+    }
+  };
+
+  {
+    std::vector<std::thread> threads;
+    for (size_t i = 0; i < 1024; ++i) {
+      threads.emplace_back(th_main);
+    }
+    for (auto& th : threads) {
+      th.join();
+    }
+  }
+  allocator.FreeUniquePtr(std::move(cuda_allocation));
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc
new file mode 100644
index 00000000000..3133627bf72
--- /dev/null
+++ b/paddle/fluid/memory/allocation/cpu_allocator.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/cpu_allocator.h"
+#include <stdlib.h>
+#include <string>
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+std::unique_ptr<Allocation> CPUAllocator::Allocate(size_t size, Attr attr) {
+  void* ptr;
+  auto status = posix_memalign(&ptr, kAlignment, size);
+  if (UNLIKELY(status) != 0) {
+    throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d",
+                                   size, status));
+  }
+  return std::unique_ptr<Allocation>(new CPUAllocation(ptr, size));
+}
+void CPUAllocator::Free(Allocation* allocation) {
+  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation*>(allocation));
+  free(allocation->ptr());
+}
+
+bool CPUAllocator::IsAllocThreadSafe() const { return true; }
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
new file mode 100644
index 00000000000..e3f35685d7e
--- /dev/null
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class CPUAllocation : public Allocation {
+ public:
+  CPUAllocation(void* ptr, size_t size)
+      : Allocation(ptr, size, platform::CPUPlace()) {}
+};
+
+class CPUAllocator : public UnmanagedAllocator {
+ public:
+  constexpr static size_t kAlignment = 64u;
+  std::unique_ptr<Allocation> Allocate(size_t size,
+                                       Attr attr = kDefault) override;
+  void Free(Allocation* allocation) override;
+  bool IsAllocThreadSafe() const override;
+};
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
new file mode 100644
index 00000000000..14e08683321
--- /dev/null
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <string>
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class CUDADeviceGuard {
+ public:
+  explicit CUDADeviceGuard(int dev_id) {
+    int prev_id = platform::GetCurrentDeviceId();
+    if (prev_id != dev_id) {
+      prev_id_ = prev_id;
+      platform::SetDeviceId(dev_id);
+    }
+  }
+
+  ~CUDADeviceGuard() {
+    if (prev_id_ != -1) {
+      platform::SetDeviceId(prev_id_);
+    }
+  }
+
+ private:
+  int prev_id_{-1};
+};
+
+std::unique_ptr<Allocation> CUDAAllocator::Allocate(size_t size, Attr attr) {
+  CUDADeviceGuard guard(place_.device);
+  void* ptr;
+  auto status = cudaMalloc(&ptr, size);
+  if (UNLIKELY(status != cudaSuccess)) {
+    throw BadAlloc(string::Sprintf(
+        "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
+        status, cudaGetErrorString(status)));
+  }
+
+  return std::unique_ptr<Allocation>(
+      new CUDAAllocation(ptr, size, platform::Place(place_)));
+}
+
+void CUDAAllocator::Free(Allocation* allocation) {
+  auto* cuda_allocation = dynamic_cast<CUDAAllocation*>(allocation);
+  PADDLE_ENFORCE_NOT_NULL(cuda_allocation);
+  PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(cuda_allocation->place()),
+                    place_);
+  PADDLE_ENFORCE(cudaFree(allocation->ptr()));
+}
+bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h
new file mode 100644
index 00000000000..4bd4c00f976
--- /dev/null
+++ b/paddle/fluid/memory/allocation/cuda_allocator.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// Just a flag type.
+class CUDAAllocation : public Allocation {
+ public:
+  using Allocation::Allocation;
+};
+
+class CUDAAllocator : public UnmanagedAllocator {
+ public:
+  explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {}
+  explicit CUDAAllocator(const platform::Place& place)
+      : place_(boost::get<platform::CUDAPlace>(place)) {}
+  std::unique_ptr<Allocation> Allocate(size_t size,
+                                       Attr attr = kDefault) override;
+  void Free(Allocation* allocation) override;
+  bool IsAllocThreadSafe() const override;
+
+ private:
+  platform::CUDAPlace place_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
new file mode 100644
index 00000000000..1e0febe10bb
--- /dev/null
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/locked_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+std::unique_ptr<Allocation> LockedAllocator::Allocate(size_t size, Attr attr) {
+  if (underlying_allocator_->IsAllocThreadSafe()) {
+    return underlying_allocator_->Allocate(size, attr);
+  } else {
+    std::lock_guard<std::mutex> guard(mtx_);
+    return underlying_allocator_->Allocate(size, attr);
+  }
+}
+void LockedAllocator::Free(Allocation *allocation) {
+  if (underlying_allocator_->IsAllocThreadSafe()) {
+    return underlying_allocator_->Free(allocation);
+  } else {
+    std::lock_guard<std::mutex> guard(mtx_);
+    return underlying_allocator_->Free(allocation);
+  }
+}
+bool LockedAllocator::IsAllocThreadSafe() const { return true; }
+
+LockedAllocator::LockedAllocator(
+    std::unique_ptr<Allocator> &&underlying_allocator) {
+  auto *allocator =
+      dynamic_cast<UnmanagedAllocator *>(underlying_allocator.get());
+  PADDLE_ENFORCE_NOT_NULL(allocator);
+  underlying_allocator.release();
+  underlying_allocator_.reset(allocator);
+}
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
new file mode 100644
index 00000000000..eed263f3bc5
--- /dev/null
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <thread>  // NOLINT
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class LockedAllocator : public UnmanagedAllocator {
+ public:
+  explicit LockedAllocator(std::unique_ptr<Allocator>&& underlying_allocator);
+  std::unique_ptr<Allocation> Allocate(size_t size,
+                                       Attr attr = kDefault) override;
+  void Free(Allocation* allocation) override;
+  bool IsAllocThreadSafe() const override;
+
+ private:
+  std::unique_ptr<UnmanagedAllocator> underlying_allocator_;
+  std::mutex mtx_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/naive_managed_allocator.cc b/paddle/fluid/memory/allocation/naive_managed_allocator.cc
new file mode 100644
index 00000000000..2a61aee8433
--- /dev/null
+++ b/paddle/fluid/memory/allocation/naive_managed_allocator.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+NaiveManagedAllocator::NaiveManagedAllocator(
+    std::unique_ptr<Allocator> &&allocator) {
+  auto *underlying_allocator =
+      dynamic_cast<UnmanagedAllocator *>(allocator.get());
+  PADDLE_ENFORCE_NOT_NULL(underlying_allocator);
+  allocator.release();
+  Init(std::unique_ptr<UnmanagedAllocator>(underlying_allocator));
+}
+
+NaiveManagedAllocator::NaiveManagedAllocator(
+    std::unique_ptr<UnmanagedAllocator> &&allocator) {
+  Init(std::move(allocator));
+}
+void NaiveManagedAllocator::Init(
+    std::unique_ptr<UnmanagedAllocator> &&allocator) {
+  underlying_allocator_ = std::move(allocator);
+}
+bool NaiveManagedAllocator::IsAllocThreadSafe() const {
+  return underlying_allocator_->IsAllocThreadSafe();
+}
+std::unique_ptr<Allocation> NaiveManagedAllocator::Allocate(size_t size,
+                                                            Attr attr) {
+  std::unique_ptr<Allocation> allocation =
+      underlying_allocator_->Allocate(size, attr);
+  return std::unique_ptr<Allocation>(
+      new NaiveManagedAllocation(std::move(allocation), shared_from_this()));
+}
+std::shared_ptr<Allocation> NaiveManagedAllocator::AllocateShared(size_t size,
+                                                                  Attr attr) {
+  std::unique_ptr<Allocation> allocation =
+      underlying_allocator_->Allocate(size, attr);
+  return std::shared_ptr<Allocation>(
+      new NaiveManagedAllocation(std::move(allocation), shared_from_this()));
+}
+
+NaiveManagedAllocation::~NaiveManagedAllocation() {
+  auto allocator = allocator_.lock();
+  if (UNLIKELY(allocator == nullptr)) {
+    // the allocator is destructed before allocations.
+    // do nothing.
+    return;
+  }
+  // invoke Free
+  allocator->UnderlyingAllocator().FreeUniquePtr(
+      std::move(underlying_allocation_));
+}
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/naive_managed_allocator.h b/paddle/fluid/memory/allocation/naive_managed_allocator.h
new file mode 100644
index 00000000000..3291eeaadb6
--- /dev/null
+++ b/paddle/fluid/memory/allocation/naive_managed_allocator.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class NaiveManagedAllocator;
+class NaiveManagedAllocation : public Allocation {
+ public:
+  NaiveManagedAllocation(std::unique_ptr<Allocation>&& underlying_allocation,
+                         std::shared_ptr<NaiveManagedAllocator> allocator)
+      : Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
+                   underlying_allocation->place()),
+        underlying_allocation_(std::move(underlying_allocation)),
+        allocator_(allocator) {}
+
+  ~NaiveManagedAllocation() final;
+
+ private:
+  std::unique_ptr<Allocation> underlying_allocation_;
+  std::weak_ptr<NaiveManagedAllocator> allocator_;
+};
+
+class NaiveManagedAllocator
+    : public ManagedAllocator,
+      public std::enable_shared_from_this<NaiveManagedAllocator> {
+ public:
+  template <typename... ARGS>
+  static std::shared_ptr<ManagedAllocator> Create(ARGS... args) {
+    return std::static_pointer_cast<ManagedAllocator>(
+        std::shared_ptr<NaiveManagedAllocator>(
+            new NaiveManagedAllocator(std::move(args)...)));
+  }
+
+  inline UnmanagedAllocator& UnderlyingAllocator() {
+    return *underlying_allocator_;
+  }
+
+  bool IsAllocThreadSafe() const override;
+  std::unique_ptr<Allocation> Allocate(size_t size,
+                                       Attr attr = kDefault) override;
+  std::shared_ptr<Allocation> AllocateShared(size_t size,
+                                             Attr attr = kDefault) override;
+
+ private:
+  explicit NaiveManagedAllocator(std::unique_ptr<Allocator>&& allocator);
+  explicit NaiveManagedAllocator(
+      std::unique_ptr<UnmanagedAllocator>&& allocator);
+  void Init(std::unique_ptr<UnmanagedAllocator>&& allocator);
+
+  std::unique_ptr<UnmanagedAllocator> underlying_allocator_;
+};
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/naive_managed_allocator_test.cc b/paddle/fluid/memory/allocation/naive_managed_allocator_test.cc
new file mode 100644
index 00000000000..027fdec26de
--- /dev/null
+++ b/paddle/fluid/memory/allocation/naive_managed_allocator_test.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
+#include <atomic>  // NOLINT
+#include <random>
+#include <thread>  // NOLINT
+#include <vector>
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class StubAllocator : public UnmanagedAllocator {
+ public:
+  std::unique_ptr<Allocation> Allocate(size_t size,
+                                       Attr attr = kDefault) override {
+    counter_.fetch_add(1);
+    return std::unique_ptr<Allocation>(
+        new Allocation(nullptr, size, platform::CPUPlace()));
+  }
+  void Free(Allocation* allocation) override { counter_.fetch_sub(1); }
+  bool IsAllocThreadSafe() const override { return true; }
+
+  std::atomic<int> counter_{0};
+};
+
+TEST(NaiveManagedAllocator, main) {
+  auto allocator = NaiveManagedAllocator::Create(
+      std::unique_ptr<Allocator>(new StubAllocator()));
+
+  auto th_main = [=] {
+    std::random_device dev;
+    std::default_random_engine engine(dev());
+    std::uniform_int_distribution<int> dist(0, 1);
+
+    std::vector<std::shared_ptr<Allocation>> allocations;
+
+    for (int j = 0; j < 1024; ++j) {
+      bool to_insert = static_cast<bool>(dist(engine));
+      if (to_insert) {
+        allocations.emplace_back(allocator->AllocateShared(10));
+      } else {
+        if (!allocations.empty()) {
+          allocations.pop_back();
+        }
+      }
+    }
+  };
+
+  {
+    std::vector<std::thread> threads;
+    for (size_t i = 0; i < 1024; ++i) {
+      threads.emplace_back(th_main);
+    }
+    for (auto& th : threads) {
+      th.join();
+    }
+  }
+  ASSERT_EQ(reinterpret_cast<StubAllocator&>(
+                std::dynamic_pointer_cast<NaiveManagedAllocator>(allocator)
+                    ->UnderlyingAllocator())
+                .counter_,
+            0);
+}
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 283745e9775..4f289f75379 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -14,13 +14,9 @@ limitations under the License. */
 
 #include <vector>
 
-#include "paddle/fluid/memory/malloc.h"
-
 #include "glog/logging.h"
-
-#include "paddle/fluid/memory/detail/buddy_allocator.h"
-#include "paddle/fluid/memory/detail/system_allocator.h"
-#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/memory/malloc.h"
 
 DEFINE_bool(init_allocated_mem, false,
             "It is a mistake that the values of the memory allocated by "
@@ -33,172 +29,14 @@ DECLARE_double(fraction_of_gpu_memory_to_use);
 namespace paddle {
 namespace memory {
 
-using BuddyAllocator = detail::BuddyAllocator;
-
-BuddyAllocator* GetCPUBuddyAllocator() {
-  static std::once_flag init_flag;
-  static detail::BuddyAllocator* a = nullptr;
-
-  std::call_once(init_flag, []() {
-    a = new detail::BuddyAllocator(
-        std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
-  });
-
-  return a;
-}
-
-template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
-  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
-  void* p = GetCPUBuddyAllocator()->Alloc(size);
-  if (FLAGS_init_allocated_mem) {
-    memset(p, 0xEF, size);
-  }
-  VLOG(10) << "  pointer=" << p;
-  return p;
-}
-
-template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
-  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
-  GetCPUBuddyAllocator()->Free(p);
-}
-
-template <>
-size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
-  return GetCPUBuddyAllocator()->Used();
-}
-
-#ifdef PADDLE_WITH_CUDA
-
-BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
-  static std::once_flag init_flag;
-  static detail::BuddyAllocator** a_arr = nullptr;
-
-  std::call_once(init_flag, [gpu_id]() {
-    int gpu_num = platform::GetCUDADeviceCount();
-    PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
-                   gpu_num);
-
-    a_arr = new BuddyAllocator*[gpu_num];
-    for (int i = 0; i < gpu_num; i++) {
-      a_arr[i] = nullptr;
-      platform::SetDeviceId(i);
-      a_arr[i] = new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
-          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
-
-      VLOG(10) << "\n\nNOTE: each GPU device use "
-               << FLAGS_fraction_of_gpu_memory_to_use * 100
-               << "% of GPU memory.\n"
-               << "You can set GFlags environment variable '"
-               << "FLAGS_fraction_of_gpu_memory_to_use"
-               << "' to change the fraction of GPU usage.\n\n";
-    }
-  });
-
-  platform::SetDeviceId(gpu_id);
-  return a_arr[gpu_id];
-}
-
-template <>
-size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
-  return GetGPUBuddyAllocator(place.device)->Used();
+std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
+                                        size_t size, Allocator::Attr attr) {
+  return allocation::AllocatorFacade::Instance().AllocShared(place, size, attr);
 }
 
-template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
-  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  auto* ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr) {
-    int cur_dev = platform::GetCurrentDeviceId();
-    platform::SetDeviceId(place.device);
-    size_t avail, total;
-    platform::GpuMemoryUsage(&avail, &total);
-    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
-                 << place.device << ", available " << avail << " bytes";
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize();
-    LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize();
-    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
-    platform::SetDeviceId(cur_dev);
-  }
-  if (FLAGS_init_allocated_mem) {
-    cudaMemset(ptr, 0xEF, size);
-  }
-  return ptr;
+std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
+                                  Allocator::Attr attr) {
+  return allocation::AllocatorFacade::Instance().Alloc(place, size, attr);
 }
-
-template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
-}
-
-BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
-  static std::once_flag init_flag;
-  static BuddyAllocator* ba = nullptr;
-
-  std::call_once(init_flag, []() {
-    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
-                                new detail::CUDAPinnedAllocator),
-                            platform::CUDAPinnedMinChunkSize(),
-                            platform::CUDAPinnedMaxChunkSize());
-  });
-
-  return ba;
-}
-
-template <>
-size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
-  return GetCUDAPinnedBuddyAllocator()->Used();
-}
-
-template <>
-void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
-                                       size_t size) {
-  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
-  void* ptr = buddy_allocator->Alloc(size);
-
-  if (ptr == nullptr) {
-    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
-                 << " bytes in CUDAPinnedPlace";
-  }
-  if (FLAGS_init_allocated_mem) {
-    memset(ptr, 0xEF, size);
-  }
-  return ptr;
-}
-
-template <>
-void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
-  GetCUDAPinnedBuddyAllocator()->Free(p);
-}
-#endif
-
-size_t Usage::operator()(const platform::CPUPlace& cpu) const {
-  return Used(cpu);
-}
-
-size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
-#ifdef PADDLE_WITH_CUDA
-  return Used(gpu);
-#else
-  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
-#endif
-}
-
-size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
-#ifdef PADDLE_WITH_CUDA
-  return Used(cuda_pinned);
-#else
-  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
-#endif
-}
-
-size_t memory_usage(const platform::Place& p) {
-  return boost::apply_visitor(Usage(), p);
-}
-
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
index 3e6bfddd69c..061ca97dd8e 100644
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
@@ -14,91 +14,21 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
+#include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
-
 namespace paddle {
 namespace memory {
+using allocation::Allocation;
+using allocation::Allocator;
 
-/**
- * \brief   Allocate memory block in one place.
- *
- * \param[in]  place  Allocation place (CPU or GPU).
- * \param[in]  size   Allocation size.
- *
- * \return  Allocated memory block address.
- *
- * \note    If return nullptr, it indicates memory allocation failed
- *          because insufficient memory in current system. When Alloc
- *          function is invoked, you must check the returned memory
- *          address is valid or not.
- */
-template <typename Place>
-void* Alloc(Place place, size_t size);
-
-/**
- * \brief   Free memory block in one place.
- *
- * \param[in]  place  Allocation place (CPU or GPU).
- * \param[in]  ptr    Memory block address to free.
- *
- */
-template <typename Place>
-void Free(Place place, void* ptr);
-
-/**
- * \brief   Total size of used memory in one place.
- *
- * \param[in]  place  Allocation place (CPU or GPU).
- *
- */
-template <typename Place>
-size_t Used(Place place);
-
-struct Usage : public boost::static_visitor<size_t> {
-  size_t operator()(const platform::CPUPlace& cpu) const;
-  size_t operator()(const platform::CUDAPlace& gpu) const;
-  size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
-};
-
-size_t memory_usage(const platform::Place& p);
-
-/**
- * \brief   Free memory block in one place.
- *
- * \note    In some cases, custom deleter is used to
- *          deallocate the memory automatically for
- *          std::unique_ptr<T> in tensor.h.
- *
- */
-template <typename T, typename Place>
-class PODDeleter {
-  static_assert(std::is_pod<T>::value, "T must be POD");
-
- public:
-  explicit PODDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
-
- private:
-  Place place_;
-};
-
-/**
- * \brief   Free memory block in one place does not meet POD
- *
- * \note    In some cases, custom deleter is used to
- *          deallocate the memory automatically for
- *          std::unique_ptr<T> in tensor.h.
- *
- */
-template <typename T, typename Place>
-class PlainDeleter {
- public:
-  explicit PlainDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
+extern std::shared_ptr<Allocation> AllocShared(
+    const platform::Place& place, size_t size,
+    Allocator::Attr attr = Allocator::kDefault);
 
- private:
-  Place place_;
-};
+extern std::unique_ptr<Allocation> Alloc(
+    const platform::Place& place, size_t size,
+    Allocator::Attr attr = Allocator::kDefault);
 
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/malloc_test.cc b/paddle/fluid/memory/malloc_test.cc
deleted file mode 100644
index d39466ef60c..00000000000
--- a/paddle/fluid/memory/malloc_test.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/malloc.h"
-
-#include <unordered_map>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
-
-inline bool is_aligned(void const *p) {
-  return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
-}
-
-size_t align(size_t size, paddle::platform::CPUPlace place) {
-  size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
-  size_t alignment = paddle::platform::CpuMinChunkSize();
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
-
-TEST(BuddyAllocator, CPUAllocation) {
-  void *p = nullptr;
-
-  EXPECT_EQ(p, nullptr);
-
-  paddle::platform::CPUPlace cpu;
-  p = paddle::memory::Alloc(cpu, 4096);
-
-  EXPECT_NE(p, nullptr);
-
-  paddle::platform::Place place = cpu;
-  EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
-
-  paddle::memory::Free(cpu, p);
-}
-
-TEST(BuddyAllocator, CPUMultAlloc) {
-  paddle::platform::CPUPlace cpu;
-
-  std::unordered_map<void *, size_t> ps;
-
-  size_t total_size = paddle::memory::Used(cpu);
-  EXPECT_EQ(total_size, 0UL);
-
-  for (auto size :
-       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
-    ps[paddle::memory::Alloc(cpu, size)] = size;
-
-    // Buddy Allocator doesn't manage too large memory chunk
-    if (paddle::memory::Used(cpu) == total_size) continue;
-
-    size_t aligned_size = align(size, cpu);
-    total_size += aligned_size;
-    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
-  }
-
-  for (auto p : ps) {
-    EXPECT_EQ(is_aligned(p.first), true);
-    paddle::memory::Free(cpu, p.first);
-
-    // Buddy Allocator doesn't manage too large memory chunk
-    if (paddle::memory::Used(cpu) == total_size) continue;
-
-    size_t aligned_size = align(p.second, cpu);
-    total_size -= aligned_size;
-    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
-  }
-}
-
-#ifdef PADDLE_WITH_CUDA
-
-size_t align(size_t size, paddle::platform::CUDAPlace place) {
-  size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
-  size_t alignment = paddle::platform::GpuMinChunkSize();
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
-
-TEST(BuddyAllocator, GPUAllocation) {
-  void *p = nullptr;
-
-  EXPECT_EQ(p, nullptr);
-
-  paddle::platform::CUDAPlace gpu(0);
-  p = paddle::memory::Alloc(gpu, 4096);
-
-  EXPECT_NE(p, nullptr);
-
-  paddle::platform::Place place = gpu;
-  EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place));
-
-  paddle::memory::Free(gpu, p);
-}
-
-TEST(BuddyAllocator, GPUMultAlloc) {
-  paddle::platform::CUDAPlace gpu;
-
-  std::unordered_map<void *, size_t> ps;
-
-  size_t total_size = paddle::memory::Used(gpu);
-  EXPECT_EQ(total_size, 0UL);
-
-  for (auto size :
-       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
-    ps[paddle::memory::Alloc(gpu, size)] = size;
-
-    // Buddy Allocator doesn't manage too large memory chunk
-    if (paddle::memory::Used(gpu) == total_size) continue;
-
-    size_t aligned_size = align(size, gpu);
-    total_size += aligned_size;
-    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
-  }
-
-  for (auto p : ps) {
-    EXPECT_EQ(is_aligned(p.first), true);
-    paddle::memory::Free(gpu, p.first);
-
-    // Buddy Allocator doesn't manage too large memory chunk
-    if (paddle::memory::Used(gpu) == total_size) continue;
-
-    size_t aligned_size = align(p.second, gpu);
-    total_size -= aligned_size;
-    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
-  }
-}
-
-size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
-  size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
-  size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
-
-TEST(BuddyAllocator, CUDAPinnedAllocator) {
-  void *p = nullptr;
-
-  EXPECT_EQ(p, nullptr);
-
-  paddle::platform::CUDAPinnedPlace cpu;
-  p = paddle::memory::Alloc(cpu, 4096);
-
-  EXPECT_NE(p, nullptr);
-
-  paddle::platform::Place place = cpu;
-  EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
-
-  paddle::memory::Free(cpu, p);
-}
-
-TEST(BuddyAllocator, CUDAPinnedMultAllocator) {
-  paddle::platform::CUDAPinnedPlace cpu;
-
-  std::unordered_map<void *, size_t> ps;
-
-  size_t total_size = paddle::memory::Used(cpu);
-  EXPECT_EQ(total_size, 0UL);
-
-  for (auto size :
-       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
-    ps[paddle::memory::Alloc(cpu, size)] = size;
-
-    // Buddy Allocator doesn't manage too large memory chunk
-    if (paddle::memory::Used(cpu) == total_size) continue;
-
-    size_t aligned_size = align(size, cpu);
-    total_size += aligned_size;
-    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
-  }
-
-  for (auto p : ps) {
-    EXPECT_EQ(is_aligned(p.first), true);
-    paddle::memory::Free(cpu, p.first);
-
-    // Buddy Allocator doesn't manage too large memory chunk
-    if (paddle::memory::Used(cpu) == total_size) continue;
-
-    size_t aligned_size = align(p.second, cpu);
-    total_size -= aligned_size;
-    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
-  }
-}
-#endif
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
index 6146ff509d7..d1d86e561c0 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "cub/cub.cuh"
+#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/gather.cu.h"
@@ -57,22 +58,18 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
   T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
 
   // Determine temporary device storage requirements
-  void *d_temp_storage = NULL;
   size_t temp_storage_bytes = 0;
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
-      d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
-      num);
-
+      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
   // Allocate temporary storage
   auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  d_temp_storage = memory::Alloc(place, temp_storage_bytes);
+  auto d_temp_storage =
+      memory::Alloc(place, temp_storage_bytes, memory::Allocator::kTmp);
 
   // Run sorting operation
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
-      d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
-      num);
-
-  memory::Free(place, d_temp_storage);
+      d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
+      idx_out, num);
 }
 
 template <typename T>
@@ -248,11 +245,12 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
   const T *boxes = proposals.data<T>();
   auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
   int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
-  uint64_t *d_mask =
-      reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes));
+  auto d_mask_allocation = memory::Alloc(place, size_bytes);
+  uint64_t *d_mask = reinterpret_cast<uint64_t *>(d_mask_allocation->ptr());
   NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
-  uint64_t *h_mask = reinterpret_cast<uint64_t *>(
-      memory::Alloc(platform::CPUPlace(), size_bytes));
+
+  auto h_mask_allocation = memory::Alloc(platform::CPUPlace(), size_bytes);
+  uint64_t *h_mask = reinterpret_cast<uint64_t *>(h_mask_allocation->ptr());
   memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
 
   std::vector<uint64_t> remv(col_blocks);
diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc
index a6ca82d16f2..3a450773a9d 100644
--- a/paddle/fluid/operators/strided_memcpy_test.cc
+++ b/paddle/fluid/operators/strided_memcpy_test.cc
@@ -87,13 +87,16 @@ TEST(StridedMemcpy, GPUCrop) {
 
   platform::CUDADeviceContext ctx(gpu0);
 
-  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
+  auto src_allocation = memory::Alloc(gpu0, sizeof(src));
+
+  int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
   memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
 
   framework::DDim src_stride({5, 1});
 
   int dst[4];
-  int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
+  auto dst_allocation = memory::Alloc(gpu0, sizeof(dst));
+  int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());
 
   framework::DDim dst_dim({2, 2});
   framework::DDim dst_stride({2, 1});
@@ -108,9 +111,6 @@ TEST(StridedMemcpy, GPUCrop) {
   ASSERT_EQ(2, dst[1]);
   ASSERT_EQ(3, dst[2]);
   ASSERT_EQ(4, dst[3]);
-
-  memory::Free(gpu0, gpu_dst);
-  memory::Free(gpu0, gpu_src);
 }
 
 TEST(StridedMemcpy, GPUConcat) {
@@ -124,12 +124,13 @@ TEST(StridedMemcpy, GPUConcat) {
   platform::CUDAPlace gpu0(0);
   platform::CPUPlace cpu;
   platform::CUDADeviceContext ctx(gpu0);
-
-  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
+  auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
+  int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
   memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
 
   int dst[8];
-  int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
+  auto gpu_dst_allocation = memory::Alloc(gpu0, sizeof(dst));
+  int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());
 
   framework::DDim src_stride({2, 1});
   framework::DDim dst_dim({2, 2});
@@ -151,9 +152,6 @@ TEST(StridedMemcpy, GPUConcat) {
   for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
     ASSERT_EQ(expect_dst[i], dst[i]);
   }
-
-  memory::Free(gpu0, gpu_dst);
-  memory::Free(gpu0, gpu_src);
 }
 
 #endif
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index dfc079e986e..0b97f5123a8 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -112,11 +112,15 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }
 
   void* allocate(size_t num_bytes) const override {
-    return paddle::memory::Alloc(place_, num_bytes);
+    auto buf =
+        paddle::memory::Alloc(place_, num_bytes, memory::Allocator::kTiny);
+    void* retv = buf->ptr();
+    allocations_[buf->ptr()] = std::move(buf);
+    return retv;
   }
 
   void deallocate(void* buffer) const override {
-    paddle::memory::Free(place_, buffer);
+    allocations_.erase(allocations_.find(buffer));
   }
 
   void* scratchpad() const override {
@@ -143,12 +147,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   const cudaDeviceProp* device_prop_;  // not owned;
   mutable void* scratch_;
   mutable unsigned int* semaphore_;
+  mutable std::unordered_map<void*, std::unique_ptr<memory::Allocation>>
+      allocations_;
 };
 
 class CudnnHolder {
  public:
   CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place)
-      : workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) {
+      : workspace_(nullptr), stream_(stream), place_(place) {
     PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
     PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_));
   }
@@ -158,36 +164,38 @@ class CudnnHolder {
   void RunFunc(const std::function<void(void*)>& cudnn_func,
                size_t required_workspace_len) {
     std::lock_guard<std::mutex> lock(mtx_);
-    if (required_workspace_len > workspace_len_) {
+    if (required_workspace_len > WorkspaceSize()) {
       ReallocateWorkspace(required_workspace_len);
     }
-    cudnn_func(workspace_);
+    cudnn_func(workspace_->ptr());
   }
 
-  ~CudnnHolder() {
-    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
-    if (workspace_ != nullptr) {
-      paddle::memory::Free(place_, workspace_);
+  ~CudnnHolder() { PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); }
+
+ private:
+  size_t WorkspaceSize() const {
+    if (workspace_ == nullptr) {
+      return 0;
+    } else {
+      return workspace_->size();
     }
   }
 
- private:
   void ReallocateWorkspace(size_t required_workspace_len) {
-    if (required_workspace_len <= workspace_len_) {
+    if (required_workspace_len <= WorkspaceSize()) {
       return;
     }
     if (workspace_ != nullptr) {
       // Maybe someone is using the current workspace
       PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
-      paddle::memory::Free(place_, workspace_);
+      workspace_.reset();
     }
-    workspace_ = paddle::memory::Alloc(place_, required_workspace_len);
-    workspace_len_ = required_workspace_len;
+    workspace_ = paddle::memory::Alloc(place_, required_workspace_len,
+                                       memory::Allocator::kFluxHuge);
   }
 
   cudnnHandle_t cudnn_handle_;
-  void* workspace_;
-  size_t workspace_len_;
+  std::unique_ptr<memory::Allocation> workspace_;
 
   const cudaStream_t* stream_;  // not owned;
   const CUDAPlace place_;
diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu
index f65d1f60100..07433a151ce 100644
--- a/paddle/fluid/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
@@ -39,7 +39,6 @@ class Multiply {
 }  // namespace
 
 using paddle::memory::Alloc;
-using paddle::memory::Free;
 using paddle::memory::Copy;
 
 using paddle::platform::CPUPlace;
@@ -63,13 +62,13 @@ TEST(Transform, GPUUnary) {
   CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
-  float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
+  auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
+  float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
   Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
   Transform<CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
   ctx.Wait();
   Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
-  Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
   }
@@ -89,13 +88,13 @@ TEST(Transform, GPUBinary) {
   int buf[4] = {1, 2, 3, 4};
   CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
-  int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
+  auto gpu_allocation = Alloc(gpu0, sizeof(buf));
+  int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
   Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
   Transform<CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
   ctx.Wait();
   Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
-  Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
     ASSERT_EQ((i + 1) * (i + 1), buf[i]);
   }
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index dc9fad29f28..86c5f87f34d 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -41,4 +41,5 @@ limitations under the License. */
 #include <boost/any.hpp>
 #include <boost/mpl/comparison.hpp>
 #include <boost/mpl/less_equal.hpp>
+#include <boost/optional.hpp>
 #include <boost/variant.hpp>
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index cfea2059c3c..b18bd70005c 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -27,8 +27,7 @@ int main(int argc, char** argv) {
     new_argv.push_back(argv[i]);
   }
 #ifdef PADDLE_WITH_CUDA
-  new_argv.push_back(
-      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
+  new_argv.push_back(strdup("--tryfromenv=fraction_of_gpu_memory_to_use"));
 #else
   new_argv.push_back(strdup(
       "--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_mb"));
@@ -37,12 +36,6 @@ int main(int argc, char** argv) {
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
   google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
-  paddle::memory::Used(paddle::platform::CPUPlace());
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::memory::Used(paddle::platform::CUDAPlace(0));
-#endif
-
   paddle::framework::InitDevices(true);
   return RUN_ALL_TESTS();
 }
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 7bbdf7de89c..f0032ab0fae 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -110,10 +110,10 @@ def __bootstrap__():
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
 
     read_env_flags = [
-        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb'
+        'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
+        'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
+        'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
+        'eager_delete_tensor_gb'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
-- 
GitLab