diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 0b9545ad0b3835fe2f6f4b346e20ef0d87facf82..062be5121e2087975d2ae617c1291a92d41b4187 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <limits>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 
 namespace paddle {
@@ -111,8 +112,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   dst->set_layout(src.layout());
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
-  auto dst_ptr = dst->mutable_data(dst_place, src.type(),
-                                   memory::Allocator::kCommunication);
+  auto dst_ptr =
+      dst->mutable_data(dst_place, src.type(), memory::Allocator::kCrossDevice);
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc
index a805e19bc9f78da7b6d0baee3fa0d80a2a8de024..98b4b035861fb3bfe1531ecbf780aef395789606 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -21,6 +21,11 @@ namespace allocation {
 ThinAlignedAllocator::ThinAlignedAllocator(
     std::shared_ptr<ManagedAllocator> underlyning_allocator)
     : underlying_allocator_(std::move(underlyning_allocator)) {}
+
+std::shared_ptr<Allocation> ThinAlignedAllocator::AllocateShared(
+    size_t size, Allocator::Attr attr) {
+  return std::shared_ptr<Allocation>(Allocate(size, attr).release());
+}
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index d9eb7870c9b695959492e7edcfaed73795cf4402..3a7868f403e008265ac2fe100193a576962829a0 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -20,34 +20,66 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// The aligned allocation and allocator wrap a managed allocator and return
+// an aligned pointer.
+//
+// NOTE(yy): For speed reasons, I just use a template parameter for the
+// alignment; however, it could be a private member if necessary.
+//
+// NOTE(yy): kAlignment must be 2^N. A `static_assert` should be added.
+template <size_t kAlignment>
 class AlignedAllocation : public Allocation {
  public:
   AlignedAllocation(std::unique_ptr<Allocation>&& underlying_allocation,
                     size_t size)
-      : Allocation(AlignedPtr(underlying_allocation->ptr()), size,
+      : Allocation(AlignedPtr(underlying_allocation->ptr()),
+                   size + kAlignment - Offset(underlying_allocation->ptr()),
                    underlying_allocation->place()),
         underlying_allocation_(std::move(underlying_allocation)) {}
 
 private:
  static void* AlignedPtr(void* ptr) {
-    auto ptr_addr = reinterpret_cast<intptr_t>(ptr);
-    ptr_addr = (ptr_addr & ~(kAlignment - 1)) + kAlignment;
-    return reinterpret_cast<void*>(ptr_addr);
+    return reinterpret_cast<void*>(reinterpret_cast<intptr_t>(ptr) +
+                                   Offset(ptr));
+  }
+
+  // Offset to the aligned pointer.
+  // If ptr is already aligned, returns 0.
+  static size_t Offset(void* ptr) {
+    auto ptr_addr = reinterpret_cast<intptr_t>(ptr);
+    intptr_t aligned_addr = (ptr_addr & ~(kAlignment - 1));
+    intptr_t diff = aligned_addr - ptr_addr;
+    if (diff == 0) {
+      return 0;
+    } else {
+      return kAlignment + diff;
+    }
   }
 
   std::unique_ptr<Allocation> underlying_allocation_;
 };
 
+// The thin aligned allocator is trivial; it is used to keep the generated
+// binary small.
+//
+// NOTE(yy): This is a trick to implement a template class. The common code
+// is extracted into this `thin` base class, so that multiple specializations
+// of the template class do not grow the binary size too much.
+//
+// NOTE(yy): This could be over-design. If it harms the readability of the
+// code, it could be removed later.
 class ThinAlignedAllocator : public ManagedAllocator {
  public:
  explicit ThinAlignedAllocator(
      std::shared_ptr<ManagedAllocator> underlyning_allocator);
 
+  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;
+
 protected:
  std::shared_ptr<ManagedAllocator> underlying_allocator_;
 };
 
+// An aligned allocator allocates a `size + kAlignment` buffer and adjusts
+// the returned pointer by the alignment offset.
+template <size_t kAlignment>
 class AlignedAllocator : public ThinAlignedAllocator {
  public:
@@ -58,9 +90,6 @@ class AlignedAllocator : public ThinAlignedAllocator {
     return std::unique_ptr<Allocation>(
         new AlignedAllocation<kAlignment>(std::move(raw_allocation), size));
   }
-  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    return std::shared_ptr<Allocation>(Allocate(size, attr).release());
-  }
 };
 
 }  // namespace allocation
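The Offset arithmetic above is easy to misread: `aligned_addr` rounds *down*, so the adjustment target is the *next* alignment boundary at or above `ptr`. The following standalone sketch (not part of the patch; the 64-byte alignment is an arbitrary choice) reproduces the same arithmetic and checks the result:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Mirrors AlignedAllocation::Offset(): round down to the previous
    // boundary; if that is not ptr itself, pad forward to the next one.
    constexpr std::intptr_t kAlignment = 64;  // must be a power of two

    std::size_t Offset(void* ptr) {
      auto ptr_addr = reinterpret_cast<std::intptr_t>(ptr);
      std::intptr_t aligned_addr = ptr_addr & ~(kAlignment - 1);  // <= ptr_addr
      std::intptr_t diff = aligned_addr - ptr_addr;               // in (-64, 0]
      return diff == 0 ? 0 : kAlignment + diff;                   // in [0, 64)
    }

    int main() {
      char buffer[2 * kAlignment];
      void* raw = buffer + 3;  // almost certainly misaligned
      void* aligned = static_cast<char*>(raw) + Offset(raw);
      assert(reinterpret_cast<std::intptr_t>(aligned) % kAlignment == 0);
      return 0;
    }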
diff --git a/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu b/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
index e4d690c296cfe9aa273c9b94688b44ef62bf5e97..b61649e59d326a64aa806460feffc3a910b1cab8 100644
--- a/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
+++ b/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
@@ -18,6 +18,9 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "unsupported/Eigen/CXX11/Tensor"
+
+// NOTE(yy): this unittest is not important. It is just used for debugging
+// and can be removed later.
 struct FillZero {
  public:
  float* ptr_;
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 1ee80a3b40e449615bcab10c1e05920215ebda38..e117a2d1537a899e3d0fe990e2aece38c1cfbd63 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <utility>
+
 #pragma once
 #include <memory>
 #include <string>
@@ -21,15 +23,22 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// Exception thrown when `Alloc`/`AllocShared` fails.
 class BadAlloc : public std::exception {
  public:
-  explicit BadAlloc(const std::string& msg) : msg_(msg) {}
+  explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
   const char* what() const noexcept override;
 
 private:
  std::string msg_;
 };
 
+// Allocation is the object that holds the actually allocated pointer.
+// `Allocation::ptr()` returns the allocated pointer.
+//
+// NOTE: this is the base class of Allocation. Each allocator can use its own
+// allocation object.
+// NOTE: `Allocation::ptr()` can be nullptr if the allocation size is 0.
 class Allocation {
  public:
  Allocation(void* ptr, size_t size, platform::Place place)
@@ -38,8 +47,22 @@ class Allocation {
  Allocation(const Allocation& o) = delete;
  Allocation& operator=(const Allocation& o) = delete;
 
+  // Returns the held pointer.
+  // NOTE: For performance reasons, it is better not to make this method
+  // virtual. If we want to implement `defragmentation` later, we might need
+  // to make the `ptr_` field protected and add a virtual method like
+  // `defragmentation` to change `ptr_`.
  void* ptr() const { return ptr_; }
 
+  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
+  // last valid element.
+  //
+  // NOTE: Some allocators might allocate more memory than requested, so the
+  // size can be larger than the requested size. For example, the
+  // AlignedAllocator always allocates `size + kAlignment` bytes. Since the
+  // raw pointer might not be aligned, an offset is added to the raw pointer,
+  // and the size of the allocation becomes `size + kAlignment - offset`.
  size_t size() const { return size_; }
 
  const platform::Place& place() const { return place_; }
@@ -52,22 +75,51 @@ class Allocation {
  platform::Place place_;
 };
 
+// Base interface class of memory Allocator.
+// To allocate memory, an allocator needs two parameters:
+//   1. the size in bytes;
+//   2. the attribute of the memory.
+// NOTE: the attribute of the memory might be ignored if the allocator does
+// not care about it.
 class Allocator {
  public:
  enum Attr {
-    kDefault = 0,
-    kTiny = 1,
-    kFixedHuge = 2,
-    kFluxHuge = 3,
-    kTmp = 4,
-    kCommunication = 5,
-    NumOfAttrs = 6
+    kDefault = 0,  // Default attribute. Uses the fastest or most stable
+                   // allocation algorithm.
+
+    kFixedHuge = 1,  // The allocation may not be freed until the program
+                     // ends, e.g., `Parameters` and `Momentum`.
+
+    kFluxHuge = 2,  // The allocation may be created and freed frequently,
+                    // and is considerably huge, like `activations` and
+                    // gradients.
+
+    kScratchpad =
+        3,  // `Scratchpad` memory is allocated and freed very soon, usually
+            // within an operator or as auxiliary memory, e.g., a CUDNN
+            // workspace or the AUX memory in batch norm.
+            //
+            // https://en.wikipedia.org/wiki/Scratchpad_memory
+
+    kCrossDevice =
+        4,  // The memory is used for cross-device memory copies or
+            // communication. For example:
+            //   1. it can use `pinned` memory for CPU-GPU communication;
+            //   2. it can use `registered` memory for RDMA communication.
+
+    NumOfAttrs = 5  // The number of all attributes. Used internally.
  };
 
  virtual ~Allocator();
+
+  // Allocate an allocation. Note that the returned allocation might need to
+  // be freed manually if the Allocator is an `UnmanagedAllocator`.
  virtual std::unique_ptr<Allocation> Allocate(
      size_t size, Allocator::Attr attr = kDefault) = 0;
 
+  // True if `Allocate` is thread safe.
  virtual bool IsAllocThreadSafe() const;
 };
 
@@ -82,7 +134,8 @@ class UnmanagedAllocator : public Allocator {
  }
 };
 
-// The allocation will be managed by smart pointers
+// The allocation will be managed by smart pointers, i.e., users do not need
+// to free the allocation manually.
 class ManagedAllocator : public Allocator {
  public:
  virtual std::shared_ptr<Allocation> AllocateShared(
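The attribute is only a hint chosen at the call site; the later hunks in this diff tag CUB temporary storage as `kScratchpad` and copy-staging buffers as `kCrossDevice`. A minimal call-site sketch under those conventions (`place`, `workspace_bytes`, and `RunKernel` are hypothetical names, not part of the patch):

    // The Attr describes the buffer's expected lifetime so a capable
    // allocator can specialize; an allocator that does not care may ignore
    // it, as the comment on Allocator above says.
    auto workspace = memory::Alloc(place, workspace_bytes,
                                   memory::Allocator::kScratchpad);
    RunKernel(workspace->ptr(), workspace->size());
    // `workspace` is a std::unique_ptr<Allocation>; callers keep it alive
    // exactly as long as the buffer is needed (cf. device_context.cc below,
    // which stores it in a map until Eigen deallocates).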
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 7816aec8f788e536293f77f21cbc22ba7bcbebd5..052e1646de68bdbdb803b2ad41f3e44a70859bed 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -46,7 +46,7 @@ class CPUManagedAllocator : public ManagedAllocator {
             std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
 
   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
-    if (attr == kCommunication) {
+    if (attr == kCrossDevice) {
       return communication_allocator_->Allocate(size, attr);
     } else {
       return normal_allocator_->Allocate(size, attr);
@@ -54,7 +54,7 @@ class CPUManagedAllocator : public ManagedAllocator {
   }
 
   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    if (attr == kCommunication) {
+    if (attr == kCrossDevice) {
       return communication_allocator_->AllocateShared(size, attr);
     } else {
       return normal_allocator_->AllocateShared(size, attr);
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index a910e40badb80476ecd3d7761f44fc1d79b73982..c03d59a3f3c16f916a7c23e86fa6e3a2abf05efe 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -24,6 +24,10 @@ namespace allocation {
 // Allocator Facade is the interface exposed to other modules.
 // All the configuration or dirty code under development should
 // be hidden behind this facade.
+//
+// NOTE(yy): This class is a singleton.
+// NOTE(yy): To create a stable ABI and make compilation faster, we use the
+// Pimpl trick here.
 class AllocatorFacadePrivate;
 class AllocatorFacade {
  public:
@@ -33,13 +37,16 @@ class AllocatorFacade {
 
   static AllocatorFacade& Instance();
 
+  // Allocate a shared allocation.
   std::shared_ptr<Allocation> AllocShared(
       const platform::Place& place, size_t size,
       Allocator::Attr attr = Allocator::kDefault);
 
+  // Allocate a unique allocation.
   std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
                                     Allocator::Attr attr = Allocator::kDefault);
 
+  // TODO(yy): Allocate a Copy-On-Write allocation?
 private:
  AllocatorFacade();
  AllocatorFacadePrivate* m_;
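A hedged usage sketch of the facade (the byte count is arbitrary; the calls and the default attribute come from the header above):

    #include "paddle/fluid/memory/allocation/allocator_facade.h"

    void Example() {
      namespace alloc = paddle::memory::allocation;
      // Allocate 1 MB of default CPU memory; the shared_ptr manages the
      // buffer's lifetime, so no manual free is needed.
      auto buffer = alloc::AllocatorFacade::Instance().AllocShared(
          paddle::platform::CPUPlace(), 1 << 20, alloc::Allocator::kDefault);
      // buffer->ptr() stays valid until the last shared_ptr copy dies.
    }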
diff --git a/paddle/fluid/memory/allocation/auto_increment_allocator.h b/paddle/fluid/memory/allocation/auto_increment_allocator.h
index 9fe370b08a79ff6bf9e359ffa78b45e02ce8c89b..116d4ca6892fdceba0dda6da8a5c6039ac24ebb5 100644
--- a/paddle/fluid/memory/allocation/auto_increment_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_increment_allocator.h
@@ -24,12 +24,27 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// The AutoIncrementAllocator manages many underlying allocators. If none of
+// them can allocate the requested memory, a new allocator is created and its
+// `Allocate` method invoked.
+//
+// NOTE(yy): The AutoIncrementAllocator prefers to allocate memory from the
+// most recently successful allocator.
+//
+// NOTE(yy): We may need to release an underlying allocator if it allocates
+// nothing. However, that is generally not useful, since it would make
+// performance nondeterministic.
+//
+// NOTE(yy): This allocator is only locked when creating a new underlying
+// allocator. Allocation requests from many threads may be dispatched to the
+// same underlying allocator, so the underlying allocator must be thread
+// safe.
 class AutoIncrementAllocator : public ManagedAllocator {
  public:
+  // Creator is the method to create a new ManagedAllocator.
   using AllocatorCreator = std::function<std::shared_ptr<ManagedAllocator>()>;
 
-  template <typename Creator>
-  explicit AutoIncrementAllocator(Creator&& creator)
+  explicit AutoIncrementAllocator(AllocatorCreator&& creator)
       : creator_(std::move(creator)), prev_success_allocator_{0} {}
   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override;
   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;
@@ -65,6 +80,11 @@ class AutoIncrementAllocator : public ManagedAllocator {
       std::lock_guard<std::mutex> guard(mtx_);
       underlying_allocators_.emplace_back(creator_());
       prev_success_allocator_ = underlying_allocators_.size() - 1;
+      PADDLE_ENFORCE(
+          underlying_allocators_[prev_success_allocator_]->IsAllocThreadSafe(),
+          "the underlying allocator must be thread safe. This is a program "
+          "bug.");
+
       return callback(*underlying_allocators_[prev_success_allocator_]);
     }
   }
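To make the contract concrete, here is a hedged construction sketch; `CreateChunkAllocator` is a hypothetical factory, and the thread-safety requirement matches the PADDLE_ENFORCE added above:

    // The creator runs under the lock whenever every existing underlying
    // allocator has failed; each call must produce a new, thread-safe
    // ManagedAllocator.
    AutoIncrementAllocator auto_increment(
        []() -> std::shared_ptr<ManagedAllocator> {
          return CreateChunkAllocator();  // hypothetical factory
        });
    auto allocation =
        auto_increment.AllocateShared(1 << 12, Allocator::kDefault);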
diff --git a/paddle/fluid/memory/allocation/conditional_allocator.h b/paddle/fluid/memory/allocation/conditional_allocator.h
index f993857c79400115ecef29af164877ba830c50b3..46af1099a5c9328aa1ec5e92e6655f277cdd8e93 100644
--- a/paddle/fluid/memory/allocation/conditional_allocator.h
+++ b/paddle/fluid/memory/allocation/conditional_allocator.h
@@ -22,6 +22,22 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// A composite allocator that dispatches the allocation request according to
+// registered conditions.
+//
+// For example:
+//
+//   auto* cond_allocator = new ConditionalAllocator();
+//   cond_allocator->AddAllocator([](size_t size, Attr attr) {
+//     // if size > 10
+//     return size > 10;
+//   }, allocator_a).AddAllocator([](size_t size, Attr attr) {
+//     // elif attr is kDefault
+//     return attr == kDefault;
+//   }, allocator_b).AddAllocator([](size_t size, Attr attr) {
+//     // else
+//     return true;
+//   }, allocator_c);
 class ConditionalAllocator : public ManagedAllocator {
  public:
  ConditionalAllocator() = default;
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index e3f35685d7e89372ecd7cc0373e0cc2dffd755dc..b2df77f1227c658e6ba83075fbc0f46340305334 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -18,7 +18,13 @@
 namespace paddle {
 namespace memory {
 namespace allocation {
-
+// CPU system allocator and allocation.
+//
+// NOTE(yy): Should we just use `malloc` here, since there is an
+// aligned_allocator?
+//
+// NOTE(yy): There is no need to use `BestFitAllocator` on the CPU. We can
+// import an open-sourced allocator into Paddle.
 class CPUAllocation : public Allocation {
  public:
  CPUAllocation(void* ptr, size_t size)
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h
index 4bd4c00f976ba7058766d982879814783807085b..dea01e60890741877a387e5588fae8703dd202ac 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_allocator.h
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// CUDA system allocator and allocation.
 // Just a flag type.
 class CUDAAllocation : public Allocation {
  public:
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
index eed263f3bc50c2e2974ea1b1497c3b67c51ed7de..f092a5bad007ee6d1081c22ffacdbb6190bd4a73 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// An allocator that makes an underlying allocator thread safe.
 class LockedAllocator : public UnmanagedAllocator {
  public:
  explicit LockedAllocator(
      std::unique_ptr<UnmanagedAllocator>&& underlying_allocator);
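LockedAllocator's job is the classic monitor pattern: serialize every call into a non-thread-safe allocator behind one mutex. A standalone illustration of the idea, using a simplified stand-in interface rather than the patch's real classes:

    #include <cstddef>
    #include <memory>
    #include <mutex>

    // Simplified stand-in for the real Allocator interface.
    struct RawAllocator {
      virtual ~RawAllocator() = default;
      virtual void* Allocate(std::size_t size) = 0;
      virtual void Free(void* ptr) = 0;
    };

    // Wraps a non-thread-safe allocator; admits one caller at a time.
    class LockedRawAllocator : public RawAllocator {
     public:
      explicit LockedRawAllocator(std::unique_ptr<RawAllocator> underlying)
          : underlying_(std::move(underlying)) {}

      void* Allocate(std::size_t size) override {
        std::lock_guard<std::mutex> guard(mtx_);
        return underlying_->Allocate(size);
      }

      void Free(void* ptr) override {
        std::lock_guard<std::mutex> guard(mtx_);
        underlying_->Free(ptr);
      }

     private:
      std::unique_ptr<RawAllocator> underlying_;
      std::mutex mtx_;
    };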
However, the +// allocation.ptr() is nullptr class ZeroSizeAllocation : public Allocation { public: explicit ZeroSizeAllocation(const platform::Place& p) diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 3b9303b7e35696bec7d6cc098873162a55736c4b..0d3817c3e7c43bf7772f16ebc45841ec5ef9a62a 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include @@ -70,7 +71,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, // Allocate temporary storage auto place = boost::get(ctx.GetPlace()); auto d_temp_storage = - memory::Alloc(place, temp_storage_bytes, memory::Allocator::kTmp); + memory::Alloc(place, temp_storage_bytes, memory::Allocator::kScratchpad); // Run sorting operation cub::DeviceRadixSort::SortPairsDescending( diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 80ffc680c2a26426dea2e96a5e285abe38543c79..6b1d5e297dd3b5a41d63e61d02afd367859d1431 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -112,8 +112,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } void* allocate(size_t num_bytes) const override { - auto buf = - paddle::memory::Alloc(place_, num_bytes, memory::Allocator::kTiny); + auto buf = paddle::memory::Alloc(place_, num_bytes, + memory::Allocator::kScratchpad); void* retv = buf->ptr(); allocations_[buf->ptr()] = std::move(buf); return retv; diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 1b95ec66bd514e23001ea5e06b6b867749e16eb8..e55f734e45b6b40741386979324ac88b1b989fa1 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -64,7 +64,7 @@ struct CastToPyBufferImpl { auto *src_ptr = static_cast(tensor.data()); auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace(), - memory::Allocator::kCommunication)); + memory::Allocator::kCrossDevice)); paddle::platform::GpuMemcpySync(dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),