diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 7cdac0de6138f13325500759c0ca2a392e2000f9..0f725a454c8448ff354fd119099f09a9eaeda9dc 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -840,6 +840,28 @@ void* AllocatorFacade::GetBasePtr( return m_->GetBasePtr(allocation); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +const std::shared_ptr& AllocatorFacade::GetAllocator( + const platform::Place& place, const gpuStream_t& stream) { + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, + /* A non-zero num to choose allocator_ */ 1); + } +#endif + return m_->GetAllocator(place, stream, /*create_if_not_found=*/true); + } + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +} +#endif + +const std::shared_ptr& AllocatorFacade::GetZeroAllocator( + const platform::Place& place) { + return m_->GetAllocator(place, /* zero size */ 0); +} + std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { return std::shared_ptr(Alloc(place, size)); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index a9b92e1801e4a3c74941388f864172f078d7128a..f4aea98003abe077d34415604980a2f165a3fd83 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -53,6 +53,14 @@ class AllocatorFacade { void* GetBasePtr(const std::shared_ptr& allocation); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + const std::shared_ptr& GetAllocator(const platform::Place& place, + const gpuStream_t& stream); +#endif + + const std::shared_ptr& GetZeroAllocator( + const platform::Place& place); + // Allocate a shared allocation. std::shared_ptr AllocShared(const platform::Place& place, size_t size); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 142e30d161ccadf3c3cb55eee430597e60d50624..fdd9883c2c9244c8b7a4dc9d623974b36d43dd0d 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -23,6 +23,7 @@ limitations under the License. */ #endif #include "glog/logging.h" #include "paddle/fluid/framework/expect.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -136,11 +137,39 @@ inline void EmplaceDeviceContext( map_ptr, platform::Place p) { using PtrType = std::unique_ptr; - map_ptr->emplace(p, std::async(std::launch::deferred, [=] { - // lazy evaluation. i.e., only create device context at - // first `Get` - return PtrType(new DevCtx(p)); - })); + map_ptr->emplace( + p, std::async(std::launch::deferred, [=] { + // lazy evaluation. 
i.e., only create device context at + // first `Get` + auto* dev_ctx = new DevCtx(p); + if (is_gpu_place(p)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + auto* cuda_ctx = dynamic_cast(dev_ctx); + PADDLE_ENFORCE_NOT_NULL( + cuda_ctx, + platform::errors::InvalidArgument( + "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); + dev_ctx->SetDeviceAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p, cuda_ctx->context()->RawStream()) + .get()); +#endif + } else { + dev_ctx->SetDeviceAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p) + .get()); + } + dev_ctx->SetHostAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CPUPlace()) + .get()); + dev_ctx->SetZeroAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(p) + .get()); + return PtrType(dev_ctx); + })); } DeviceContextPool::DeviceContextPool( diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 15f9f0bda3c25e2b8a4125d1025d8b0a673f2dc5..7373ba79c0a5cfeb537ecda7ec318e533e23b555 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -68,6 +68,45 @@ bool DenseTensor::IsSharedWith(const DenseTensor& b) const { return holder_ && holder_ == b.Holder(); } +void* DenseTensor::AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + paddle::platform::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); + if (this->dtype() != dtype) { + VLOG(10) << "change data type in mutbale_data, target dtype - " << dtype; + meta_.dtype = dtype; + } + PADDLE_ENFORCE( + valid(), + paddle::platform::errors::PreconditionNotMet( + "The meta data must be valid when call the mutable data function.")); + size_t bytes = numel() * SizeOf(this->dtype()); + if (requested_size) { + PADDLE_ENFORCE_GE(requested_size, + bytes, + paddle::platform::errors::InvalidArgument( + "The reserved size %d should be enough to meet the " + "volume required by metadata %d.", + requested_size, + bytes)); + bytes = requested_size; + } + // TODO(paddle-dev): In case of the allocator of storage_ is different with + // the incoming allocator, we should re-alloc data using the incoming + // allocator. + if (!holder_ || holder_->size() < bytes + meta_.offset) { + meta_.offset = 0; + VLOG(10) << "Allocate data with bytes: " << bytes; + ResetHolder(allocator->Allocate(bytes)); + } + + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + meta_.offset); +} + template const T* DenseTensor::data() const { check_memory_size(); diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 2823441f97da2a784d6fb175429a0496e50d6aaa..fbecbcf0a1f2682abc87c7dbf605ead291fdf8fb 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -124,6 +124,12 @@ class DenseTensor : public TensorBase, /// return Whether the storage is allocated. bool initialized() const override { return holder_ && holder_->ptr(); } + /// \brief Allocate memory with requested size from allocator. + /// \return The mutable data pointer value of type T. + void* AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size = 0) override; + /// \brief Check if storage is shared with other objects. /// \return Whether the storage is shared with other objects. 
bool IsSharedWith(const DenseTensor& b) const; diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index 7566b351bf63401acba3bad247b10bd7bb3c9cf1..d6e01c5c6e66494b71d5d7fad763bb5dd7b5b138 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -13,45 +13,119 @@ // limitations under the License. #include "paddle/pten/core/device_context.h" -#include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/core/tensor_base.h" namespace pten { +using DataType = paddle::experimental::DataType; struct DeviceContext::Impl { Impl() = default; ~Impl() = default; - void SetDeviceAllocator(Allocator* allocator) { + void SetDeviceAllocator(const Allocator* allocator) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + pten::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); device_allocator_ = allocator; } - void SetHostAllocator(Allocator* allocator) { host_allocator_ = allocator; } + void SetHostAllocator(const Allocator* allocator) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + pten::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); + host_allocator_ = allocator; + } + + void SetZeroAllocator(const Allocator* allocator) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + pten::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); + zero_allocator_ = allocator; + } const Allocator& GetDeviceAllocator() const { - PD_CHECK(device_allocator_ != nullptr, "the device_allocator is nullptr."); + PADDLE_ENFORCE_NOT_NULL( + device_allocator_, + pten::errors::InvalidArgument("Required device_allocator_ shall not be " + "nullptr, but received nullptr.")); return *device_allocator_; } const Allocator& GetHostAllocator() const { - PD_CHECK(host_allocator_ != nullptr, "the host_allocator is nullptr."); + PADDLE_ENFORCE_NOT_NULL( + host_allocator_, + pten::errors::InvalidArgument("Required host_allocator_ shall not be " + "nullptr, but received nullptr.")); return *host_allocator_; } - // TODO(Wilber): Add impl. It seems that tensorbase not have interface to - // communicate with allocator. - void HostAlloc(TensorBase* tensor) {} - void DeviceAlloc(TensorBase* tensor) {} + const Allocator& GetZeroAllocator() const { + PADDLE_ENFORCE_NOT_NULL( + zero_allocator_, + pten::errors::InvalidArgument("Required host_allocator_ shall not be " + "nullptr, but received nullptr.")); + return *zero_allocator_; + } + + void* Alloc(TensorBase* tensor, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const { + PADDLE_ENFORCE_NOT_NULL( + tensor, + pten::errors::InvalidArgument( + "Required tensor shall not be nullptr, but received nullptr.")); + if (dtype == DataType::UNDEFINED) { + dtype = tensor->dtype(); + } + auto* allocator = + tensor->numel() == 0 ? 
zero_allocator_ : device_allocator_; + return tensor->AllocateFrom( + const_cast(allocator), dtype, requested_size); + } + + template + T* Alloc(TensorBase* tensor, size_t requested_size = 0) const { + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); + return static_cast(Alloc(tensor, dtype, requested_size)); + } - Allocator* device_allocator_{nullptr}; - Allocator* host_allocator_{nullptr}; + void* HostAlloc(TensorBase* tensor, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const { + PADDLE_ENFORCE_NOT_NULL( + tensor, + pten::errors::InvalidArgument( + "Required tensor shall not be nullptr, but received nullptr.")); + if (dtype == DataType::UNDEFINED) { + dtype = tensor->dtype(); + } + auto* allocator = tensor->numel() == 0 ? zero_allocator_ : host_allocator_; + return tensor->AllocateFrom( + const_cast(allocator), dtype, requested_size); + } + + template + T* HostAlloc(pten::TensorBase* tensor, size_t requested_size = 0) const { + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); + return static_cast(HostAlloc(tensor, dtype, requested_size)); + } + + private: + const Allocator* device_allocator_{nullptr}; + const Allocator* host_allocator_{nullptr}; + const Allocator* zero_allocator_{nullptr}; }; DeviceContext::DeviceContext() { impl_ = std::make_unique(); } DeviceContext::DeviceContext(const DeviceContext& other) { - impl_->SetDeviceAllocator( - const_cast(&other.GetDeviceAllocator())); - impl_->SetHostAllocator(const_cast(&other.GetHostAllocator())); + impl_->SetHostAllocator(&other.GetHostAllocator()); + impl_->SetDeviceAllocator(&other.GetDeviceAllocator()); + impl_->SetZeroAllocator(&other.GetZeroAllocator()); } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -60,26 +134,71 @@ DeviceContext::DeviceContext(DeviceContext&& other) { DeviceContext::~DeviceContext() = default; -void DeviceContext::SetHostAllocator(Allocator* allocator) { - impl_->SetHostAllocator(allocator); +void DeviceContext::SetDeviceAllocator(const Allocator* allocator) { + impl_->SetDeviceAllocator(allocator); } -void DeviceContext::SetDeviceAllocator(Allocator* allocator) { - impl_->SetDeviceAllocator(allocator); +const Allocator& DeviceContext::GetDeviceAllocator() const { + return impl_->GetDeviceAllocator(); +} + +void DeviceContext::SetHostAllocator(const Allocator* allocator) { + impl_->SetHostAllocator(allocator); } const Allocator& DeviceContext::GetHostAllocator() const { return impl_->GetHostAllocator(); } -const Allocator& DeviceContext::GetDeviceAllocator() const { - return impl_->GetDeviceAllocator(); +void DeviceContext::SetZeroAllocator(const Allocator* allocator) { + impl_->SetZeroAllocator(allocator); } -void DeviceContext::HostAlloc(TensorBase* tensor) { impl_->HostAlloc(tensor); } +const Allocator& DeviceContext::GetZeroAllocator() const { + return impl_->GetZeroAllocator(); +} -void DeviceContext::DeviceAlloc(TensorBase* tensor) { - impl_->DeviceAlloc(tensor); +void* DeviceContext::Alloc(TensorBase* tensor, + DataType dtype, + size_t requested_size) const { + return impl_->Alloc(tensor, dtype, requested_size); } +template +T* DeviceContext::Alloc(TensorBase* tensor, size_t requested_size) const { + return impl_->Alloc(tensor, requested_size); +} + +void* DeviceContext::HostAlloc(TensorBase* tensor, + DataType dtype, + size_t requested_size) const { + return impl_->HostAlloc(tensor, dtype, requested_size); +} + +template +T* DeviceContext::HostAlloc(TensorBase* tensor, size_t requested_size) const { + return impl_->HostAlloc(tensor, 
requested_size); +} + +#define DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(dtype) \ + template dtype* DeviceContext::Alloc(TensorBase* tensor, \ + size_t requested_size) const; \ + template dtype* DeviceContext::HostAlloc(TensorBase* tensor, \ + size_t requested_size) const; + +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(bool) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int8_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(uint8_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int16_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int32_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(int64_t) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(float) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(double) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::bfloat16) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::float16) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64) +DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128) + +#undef DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION + } // namespace pten diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index c658a24c3527d50efacc9b2b768ac8f07c07b338..30be5cd22dd4e07980d939beca34fad4d9aa00e3 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -19,6 +19,7 @@ limitations under the License. */ // TODO(wilber): Do we need to use place in pten kernel? #include "paddle/pten/common/place.h" +#include "paddle/pten/common/data_type.h" #include "paddle/pten/core/allocator.h" namespace pten { @@ -31,6 +32,8 @@ class TensorBase; * DeviceContext. */ class DeviceContext { + using DataType = paddle::experimental::DataType; + public: /** * @brief Default construct. @@ -53,42 +56,61 @@ class DeviceContext { virtual ~DeviceContext(); /** - * @brief Set the deveice-releated Allocator object. + * @brief Set the device-related Allocator object. * * @param allocator */ - void SetDeviceAllocator(Allocator*); + void SetDeviceAllocator(const Allocator*); /** - * @brief Get the const deveice-releated Allocator object. + * @brief Set the host Allocator object. * - * @return Allocator + * @param allocator */ - const Allocator& GetDeviceAllocator() const; + void SetHostAllocator(const Allocator*); /** - * @brief Allocate device memory for tensor. - */ - void DeviceAlloc(pten::TensorBase*); + * @brief Set the zero-size Allocator object. + * + * @param allocator + */ + void SetZeroAllocator(const Allocator*); /** - * @brief Set the host Allocator object. + * @brief Get the const Allocator object. * - * @param allocator + * @return Allocator */ - void SetHostAllocator(Allocator*); + const Allocator& GetDeviceAllocator() const; /** - * @brief Get the const host Allocator object. + * @brief Get the const device-related Allocator object. * * @return Allocator */ const Allocator& GetHostAllocator() const; + const Allocator& GetZeroAllocator() const; + + /** + * @brief Allocate device memory for tensor. + */ + void* Alloc(TensorBase*, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const; + + template + T* Alloc(TensorBase* tensor, size_t requested_size = 0) const; + /** * @brief Allocate host memory for tensor. */ - void HostAlloc(pten::TensorBase*); + void* HostAlloc(TensorBase* tensor, + DataType dtype = DataType::UNDEFINED, + size_t requested_size = 0) const; + + template + T* HostAlloc(TensorBase* tensor, size_t requested_size = 0) const; // TODO(wilber): Just for the convenience of migrating the code, it will be // modified or removed later. 
diff --git a/paddle/pten/core/selected_rows.cc b/paddle/pten/core/selected_rows.cc index 1dfcfa49347b50d305c2b37ccc4379eedb08a107..7578faf6143daa5cf4914505f413ff27aa6860e1 100644 --- a/paddle/pten/core/selected_rows.cc +++ b/paddle/pten/core/selected_rows.cc @@ -91,6 +91,12 @@ struct TensorFillVisitor { int64_t size_; }; +void* SelectedRows::AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size) { + return value_->AllocateFrom(allocator, dtype, requested_size); +} + bool SelectedRows::HasKey(int64_t key) const { return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? false : true; diff --git a/paddle/pten/core/selected_rows.h b/paddle/pten/core/selected_rows.h index e12f59d02f2ba21054700248404640730614b277..2f224e42ea070e8888510c5e34813d3974327f7b 100644 --- a/paddle/pten/core/selected_rows.h +++ b/paddle/pten/core/selected_rows.h @@ -113,6 +113,10 @@ class SelectedRows : public TensorBase, bool auto_grown = false, bool is_test = false); + void* AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size = 0) override; + /* * @brief Get the index of the key from id_to_index_ map. If the key not * exist, diff --git a/paddle/pten/core/tensor_base.h b/paddle/pten/core/tensor_base.h index 662553cbcb5986daae13c11cb43b2ecf36bc12c2..7a5e42da4908b2509ab3ce205650d7ec89d5d1f6 100644 --- a/paddle/pten/core/tensor_base.h +++ b/paddle/pten/core/tensor_base.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/pten/common/backend.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" +#include "paddle/pten/core/allocator.h" #include "paddle/pten/core/ddim.h" #include "paddle/pten/core/storage.h" #include "paddle/pten/core/utils/type_registry.h" @@ -61,6 +62,16 @@ class TensorBase { /// return Whether the storage is allocated. virtual bool initialized() const = 0; + // TODO(Aurelius84): This interface is under intermediate state now. + // We will remove DataType argument in the future. Please DO NOT + // rely on Datatype to much when design and implement other feature. + + /// \brief Allocate memory with requested size from allocator. + /// \return The mutable data pointer value of type T. + virtual void* AllocateFrom(Allocator* allocator, + DataType dtype, + size_t requested_size = 0) = 0; + /// \brief Return the type information of the derived class to support /// safely downcast in non-rtti environment. /// return The type information of the derived class. 
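For context, `TensorBase::AllocateFrom` introduced above is the hook that the context-level `Alloc`/`HostAlloc` calls bottom out in. A minimal sketch of driving it directly on a `DenseTensor`; the helper below is illustrative only, and in kernels the allocator pointer comes from the `DeviceContext`, as in `Impl::Alloc` above:

```cpp
#include "paddle/pten/core/allocator.h"
#include "paddle/pten/core/dense_tensor.h"

// Illustrative helper, not part of this change. Assumes out->dims() has
// already been set so numel() is meaningful.
float* AllocateAsFloat(pten::DenseTensor* out,
                       pten::Allocator* alloc,
                       size_t requested_size = 0) {
  // Reserves numel() * sizeof(float) bytes (or requested_size, which must be
  // at least that large); the existing holder is reused unless it is missing
  // or too small for the request.
  return static_cast<float*>(
      out->AllocateFrom(alloc, pten::DataType::FLOAT32, requested_size));
}
```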
diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc index edb8f59e2677199dde7ca1a0ae7fed76e655e81f..24371ca7690de6ff45020499a9ca667e42934bae 100644 --- a/paddle/pten/kernels/cpu/cast_kernel.cc +++ b/paddle/pten/kernels/cpu/cast_kernel.cc @@ -36,7 +36,7 @@ void CastKernelImpl(const CPUContext& dev_ctx, auto numel = x.numel(); auto* in_end = in_begin + numel; - auto* out_begin = out->mutable_data(dev_ctx.GetPlace()); + auto* out_begin = dev_ctx.Alloc(out); paddle::platform::Transform trans; trans(dev_ctx, diff --git a/paddle/pten/kernels/cpu/copy_kernel.cc b/paddle/pten/kernels/cpu/copy_kernel.cc index be5170f4d05aab459df45fc6c36e0f34511c22b0..0892e3974febd1bc4c8ac890cd123066b960e382 100644 --- a/paddle/pten/kernels/cpu/copy_kernel.cc +++ b/paddle/pten/kernels/cpu/copy_kernel.cc @@ -37,7 +37,7 @@ void Copy(const Context& dev_ctx, << src_place; dst->Resize(src.dims()); - auto* dst_ptr = dst->mutable_data(src_place); + auto* dst_ptr = dev_ctx.Alloc(dst); if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc index e6ffd3b5000b3f8152d6d2f9840b5379408022e7..5cef8d0bdd56d08731d617f0bd9c732fe1688af5 100644 --- a/paddle/pten/kernels/cpu/dot_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -29,7 +29,7 @@ void DotKernel(const Context& dev_ctx, DenseTensor* out) { auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; - auto* z = out->mutable_data(dev_ctx.GetPlace()); + T* z = dev_ctx.template Alloc(out); // Loop over the total N elements of both operands while sum-reducing every // B pairs along the way where B is the dimension of the least ordered axis diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index 179a1881189222e18f2dde14c35c14caadc831f4..2d717414d70f5463fc12d6ec64774351d36bcc7e 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -45,10 +45,8 @@ struct SameDimsAddFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VADD(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); + blas.VADD( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -61,7 +59,7 @@ struct SameDimsAddFunctor< const DenseTensor& x, const DenseTensor& y, DenseTensor* z) { - z->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(z); auto eigen_x = pten::EigenVector::Flatten(x); auto eigen_y = pten::EigenVector::Flatten(y); auto eigen_z = pten::EigenVector::Flatten(*z); @@ -89,10 +87,8 @@ struct SameDimsSubtractFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VSUB(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); + blas.VSUB( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -147,10 +143,8 @@ struct SameDimsDivideFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VDIV(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); + blas.VDIV( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -173,10 +167,8 @@ struct SameDimsMultiplyFunctor< const DenseTensor& y, DenseTensor* z) { auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VMUL(x.numel(), - x.data(), - y.data(), - z->mutable_data(dev_ctx.GetPlace())); 
+ blas.VMUL( + x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); } }; @@ -241,8 +233,8 @@ void CommonGradBroadcastCPU(const DenseTensor& x, const T* y_data = y.data(); const Tout* out_data = out.data(); const Tout* dout_data = dout.data(); - T* dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); - T* dy_data = dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()); + T* dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); + T* dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); if (dx_data != nullptr) { memset(dx_data, 0, dx->numel() * sizeof(T)); } @@ -292,7 +284,7 @@ void CommonForwardBroadcastCPU(const DenseTensor& x, PADDLE_ENFORCE_NOT_NULL(y_data, paddle::platform::errors::InvalidArgument( "The input Y should not be empty.")); - OutType* out_data = z->mutable_data(ctx.GetPlace()); + OutType* out_data = ctx.Alloc(z); const int out_size = std::accumulate( out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); @@ -373,7 +365,7 @@ void ElementwiseCompute(const CPUContext& dev_ctx, int axis, Functor func, DenseTensor* z) { - z->mutable_data(dev_ctx.GetPlace()); + dev_ctx.Alloc(z); auto x_dims = x.dims(); auto y_dims = y.dims(); bool is_xsize_larger = true; @@ -677,32 +669,30 @@ void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, return; } if (post == 1) { - ElemwiseGradBroadcast1CPU( - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast1CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); } else { - ElemwiseGradBroadcast2CPU( - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - post, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast2CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? 
nullptr : ctx.Alloc(dy)); } } diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 6d76626605c5c7bd3ea39470e824c57cb2a6484d..d4987e7a3606987ab64449e1346c788431895788 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -37,7 +37,7 @@ namespace pten { const DenseTensor& y, \ int axis, \ DenseTensor* out) { \ - out->mutable_data(dev_ctx.GetPlace()); \ + dev_ctx.template Alloc(out); \ if (x.dims() == y.dims()) { \ SameDimsElementwiseCompute>()( \ dev_ctx, x, y, out); \ @@ -85,7 +85,7 @@ void DivideRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { // allocate memory for out - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); if (x.dims() == y.dims() && std::is_floating_point::value) { SameDimsElementwiseCompute>()( dev_ctx, x, y, out); diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h index 8f84bd0515b516e25821f8fa84d6935aa6260032..2b0659ac2e35746634fe01e6ee5263aabf4806a6 100644 --- a/paddle/pten/kernels/cpu/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -119,7 +119,7 @@ void GetShuffledInput(const DeviceContext& dev_ctx, GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis); shuffled_input->ResizeAndAllocate(shuffled_dims); - shuffled_input->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(shuffled_input); pten::math::TransposeNormal trans; trans(dev_ctx, input, shuffled_input, perm_axis); @@ -158,7 +158,7 @@ void ReduceKernelImpl(const DeviceContext& dev_ctx, const std::vector& dims, bool keep_dim, bool reduce_all) { - output->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(output); if (reduce_all) { // Flatten and reduce 1-D tensor diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index 774d3891b03726a940b6f31a4058e37f3c79277d..4f999ac4d17ecac4d9a9959b82a4681f1643c386 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -33,7 +33,7 @@ void ScaleKernel(const Context& dev_ctx, bool bias_after_scale, DenseTensor* out) { // calc - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); auto eigen_out = pten::EigenVector::Flatten(*out); auto eigen_x = pten::EigenVector::Flatten(x); auto& dev = *dev_ctx.eigen_device(); diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index 6ce4998287956c29140c8c3690661b2e92a6f450..ecb058d35b909bc9455b019e55ab8f2277fd587b 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -29,7 +29,7 @@ void EmptyKernel(const Context& dev_ctx, template void EmptyLikeKernel(const Context& dev_ctx, DenseTensor* out) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); } } // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 9ea27fd9c5b8d5f9b9a4d6fb0d6cb608d13f5984..3f1651eeb276f0828daed7bee213030bd80de72b 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -229,7 +229,7 @@ class TransformFunctor { const bool is_xsize_larger = true) : x_(x.data()), y_(y.data()), - z_(z->mutable_data(ctx.GetPlace())), + z_(ctx.template Alloc(z)), nx_(x.numel()), ctx_(ctx), func_(func), @@ -425,8 +425,8 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, dout.data(), dx_op, dy_op, - dx == nullptr ? 
nullptr : dx->mutable_data(dev_ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(dev_ctx.GetPlace())}); + dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), + dy == nullptr ? nullptr : dev_ctx.template Alloc(dy)}); } inline void ElementwiseGradPreProcess(const DenseTensor &dout, @@ -631,7 +631,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx, ins_data[i] = ins[i]->data(); } for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (*outs)[i]->mutable_data(ctx.GetPlace()); + outs_data[i] = ctx.Alloc((*outs)[i]); } #ifdef PADDLE_WITH_XPU2 int block_size = 64; diff --git a/paddle/pten/kernels/funcs/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc index 13cfaedb33d38ee2bb6052ea622fc59b659f581a..7d4dc3c7ce8f00fece82e5a27af5347b5d5cfabf 100644 --- a/paddle/pten/kernels/funcs/transpose.cc +++ b/paddle/pten/kernels/funcs/transpose.cc @@ -36,7 +36,7 @@ struct TransposeNormal { auto in_stride = pten::framework::stride(in.dims()); auto out_stride = pten::framework::stride(out->dims()); const T* in_ptr = in.data(); - T* out_ptr = out->mutable_data(dev_ctx.GetPlace()); + T* out_ptr = dev_ctx.template Alloc(out); auto transpose_helper = [&](int64_t beg, int64_t end) { for (int64_t out_idx = beg; out_idx < end; ++out_idx) { diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu index 24d72ca3d81ce455bcaee2d9d82261707674fb2c..a7b7184487c962af14a9a92510bca14103bcf2f5 100644 --- a/paddle/pten/kernels/funcs/transpose.cu +++ b/paddle/pten/kernels/funcs/transpose.cu @@ -61,7 +61,7 @@ struct TransposeNormal { auto in_stride = pten::framework::stride(in.dims()); auto out_stride = pten::framework::stride(out->dims()); auto* in_ptr = in.data(); - auto* out_ptr = out->mutable_data(dev_ctx.GetPlace()); + T* out_ptr = dev_ctx.template Alloc(out); // copy in_stride, out_stride, axis to gpu device const paddle::platform::CUDAPlace& cuda_place = dev_ctx.GetPlace(); diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 12f246c3238d067a49032f077d472609c570cb93..81d09ef164652f50cc004998990c3670e1eaed66 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -43,7 +43,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx, std::vector outputs; inputs.emplace_back(&x); outputs.emplace_back(out); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.Alloc(out); pten::funcs::LaunchSameDimsElementwiseCudaKernel( diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 75aacc8d3d1179861526a68bc9a30cb27340adf8..24bd034fb15a0df3e19c60bfebb56f90c72da75a 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -29,7 +29,7 @@ void DotKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); if (1 == out->dims().size()) { auto eigen_out = pten::EigenScalar::From(*out); auto eigen_x = pten::EigenVector::Flatten(x); diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 9a3ae7f12dfcd62a1a18154971fa99ab72c5561d..6f744212cd5b6ed8aafd9e48313517e82b27c27c 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -352,7 +352,7 @@ void LaunchKernel(const KPDevice &ctx, pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data; for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (*outs)[i]->mutable_data(ctx.GetPlace()); + outs_data[i] = 
ctx.Alloc((*outs)[i]); } for (int i = 0; i < Arity; i++) { @@ -1264,8 +1264,8 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, const T *y_data = y.data(); const Tout *out_data = out.data(); const Tout *dout_data = dout.data(); - T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); - T *dy_data = dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()); + T *dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); + T *dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); std::vector x_one_indexs; std::vector y_one_indexs; @@ -1923,34 +1923,32 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, return; } if (post == 1) { - ElemwiseGradBroadcast1CUDA( - ctx.stream(), - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast1CUDA(ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); } else { - ElemwiseGradBroadcast2CUDA( - ctx.stream(), - x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - post, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + ElemwiseGradBroadcast2CUDA(ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); } } diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 996d85d3f42a7996e481a4887a0d0f4fa2587893..37104c46a49427828f75e09c5b8aa544051df530 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -47,7 +47,7 @@ namespace pten { inputs.emplace_back(&x); \ inputs.emplace_back(&y); \ outputs.emplace_back(out); \ - out->mutable_data(dev_ctx.GetPlace()); \ + dev_ctx.template Alloc(out); \ LaunchElementwiseCudaKernel( \ dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ } diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index dd7c2f242ea4dbb473702c63716393566cd912c5..5aba001267a0a26bf587ff201a1c71462d094865 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -54,7 +54,7 @@ void ScaleKernel(const Context& dev_ctx, std::vector outputs; inputs.emplace_back(&x); outputs.emplace_back(out); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); pten::funcs::LaunchSameDimsElementwiseCudaKernel( diff --git a/paddle/pten/kernels/impl/complex_kernel_impl.h b/paddle/pten/kernels/impl/complex_kernel_impl.h index aa878f7e9eb7f157ab193ff5129e5752aded67e6..7e4c4f0d66d4fc89634eb7bde9eb24e2743d4a7c 100644 --- a/paddle/pten/kernels/impl/complex_kernel_impl.h +++ b/paddle/pten/kernels/impl/complex_kernel_impl.h @@ -26,7 +26,7 @@ void ConjKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = out->mutable_data(dev_ctx.GetPlace()); + auto* out_data = dev_ctx.template Alloc(out); paddle::platform::ForRange for_range(dev_ctx, numel); paddle::operators::math::ConjFunctor functor(x_data, numel, out_data); diff --git a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h 
b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h index d0c6cf6793e6d37e3aad4a3a601280a9f02d0013..d4ea9fc944527145269fdfd1a854aca1299a6018 100644 --- a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h @@ -73,7 +73,7 @@ struct DotGradFunction::From(*tensor_dout); if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dx); auto y = EigenMatrix::From(*tensor_y); auto& dev = *ctx.eigen_device(); Eigen::DSizes size(1, tensor_dx->dims()[1]); @@ -85,7 +85,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dy); auto x = EigenMatrix::From(*tensor_x); auto& dev = *ctx.eigen_device(); Eigen::DSizes size(1, tensor_dy->dims()[1]); @@ -100,7 +100,7 @@ struct DotGradFunctiondata(); if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); + auto* data_dx = ctx.template Alloc(tensor_dx); const auto* data_y = tensor_y->data(); const DDim& dim = tensor_x->dims(); size_t N = static_cast(pten::framework::product(dim)); @@ -115,7 +115,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + auto* data_dy = ctx.template Alloc(tensor_dy); const auto* data_x = tensor_x->data(); const DDim& dim = tensor_y->dims(); size_t N = static_cast(pten::framework::product(dim)); @@ -164,7 +164,7 @@ struct DotGradFunction::From(*tensor_dout); if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dx); auto y = EigenMatrix::From(*tensor_y); auto dx = EigenMatrix::From(*tensor_dx); auto& dev = *ctx.eigen_device(); @@ -173,7 +173,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dy); auto x = EigenMatrix::From(*tensor_x); auto dy = EigenMatrix::From(*tensor_dy); auto& dev = *ctx.eigen_device(); @@ -189,7 +189,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + auto* dx = ctx.template Alloc(tensor_dx); for (auto j = 0; j < N / B; ++j) { auto const ss = dz[j]; for (auto i = 0; i < B; ++i) *dx++ = *y++ * ss; @@ -197,7 +197,7 @@ struct DotGradFunctionmutable_data(ctx.GetPlace()); + auto* dy = ctx.template Alloc(tensor_dy); for (auto j = 0; j < N / B; ++j) { auto const ss = dz[j]; for (auto i = 0; i < B; i++) *dy++ = *x++ * ss; @@ -272,7 +272,7 @@ struct DotDoubleGradFunctiondata(); if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); + auto* data_dx = ctx.template Alloc(tensor_dx); const auto* data_ddy = tensor_ddy->data(); const DDim& dim = tensor_dx->dims(); size_t N = static_cast(product(dim)); @@ -287,7 +287,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_dy = ctx.template Alloc(tensor_dy); const auto* data_ddx = tensor_ddx->data(); const DDim& dim = tensor_dy->dims(); size_t N = static_cast(product(dim)); @@ -302,7 +302,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_ddout = ctx.template Alloc(tensor_ddout); auto* data_x = tensor_x->data(); auto* data_y = tensor_y->data(); auto* data_ddx = tensor_ddx->data(); @@ -351,7 +351,7 @@ struct DotDoubleGradFunction::Flatten(*tensor_dout); if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dx); auto ddy = EigenVector::Flatten(*tensor_ddy); Eigen::DSizes size(tensor_ddy->numel()); auto dx = EigenVector::Flatten(*tensor_dx); @@ -359,7 +359,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_dy); auto ddx = EigenVector::Flatten(*tensor_ddx); Eigen::DSizes size(tensor_ddx->numel()); @@ -368,7 +368,7 @@ struct 
DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(tensor_ddout); auto x = EigenVector::Flatten(*tensor_x); auto y = EigenVector::Flatten(*tensor_y); auto ddx = EigenVector::Flatten(*tensor_ddx); @@ -381,7 +381,7 @@ struct DotDoubleGradFunctiondata(); if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); + auto* data_dx = ctx.template Alloc(tensor_dx); const auto* data_ddy = tensor_ddy->data(); const DDim& dim = tensor_dx->dims(); size_t N = static_cast(product(dim)); @@ -396,7 +396,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_dy = ctx.template Alloc(tensor_dy); const auto* data_ddx = tensor_ddx->data(); const DDim& dim = tensor_dy->dims(); size_t N = static_cast(product(dim)); @@ -411,7 +411,7 @@ struct DotDoubleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_ddout = ctx.template Alloc(tensor_ddout); auto* data_x = tensor_x->data(); auto* data_y = tensor_y->data(); auto* data_ddx = tensor_ddx->data(); @@ -552,7 +552,7 @@ struct DotTripleGradFunctiondata(); if (out_tensor_d_x) { - auto* data_d_x = out_tensor_d_x->mutable_data(ctx.GetPlace()); + auto* data_d_x = ctx.template Alloc(out_tensor_d_x); const auto* data_ddy = in_tensor_ddy->data(); const DDim& dim = out_tensor_d_x->dims(); @@ -567,7 +567,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_y = ctx.template Alloc(out_tensor_d_y); const auto* data_ddx = in_tensor_ddx->data(); const DDim& dim = out_tensor_d_y->dims(); @@ -582,7 +582,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); auto* data_ddx = in_tensor_ddx->data(); auto* data_ddy = in_tensor_ddy->data(); auto* data_d_dx = in_tensor_d_dx->data(); @@ -613,7 +613,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); auto* data_dout = in_tensor_dout->data(); auto* data_d_dy = in_tensor_d_dy->data(); auto* data_y = in_tensor_y->data(); @@ -633,7 +633,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); auto* data_dout = in_tensor_dout->data(); auto* data_d_dx = in_tensor_d_dx->data(); auto* data_x = in_tensor_x->data(); @@ -678,7 +678,7 @@ struct DotTripleGradFunction::Flatten(*in_tensor_d_ddout); if (out_tensor_d_x) { - out_tensor_d_x->mutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_x); auto ddy = EigenVector::Flatten(*in_tensor_ddy); Eigen::DSizes size(in_tensor_ddy->numel()); auto d_x = EigenVector::Flatten(*out_tensor_d_x); @@ -686,7 +686,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_y); auto ddx = EigenVector::Flatten(*in_tensor_ddx); Eigen::DSizes size(in_tensor_ddx->numel()); @@ -695,7 +695,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_dout); auto ddx = EigenVector::Flatten(*in_tensor_ddx); auto ddy = EigenVector::Flatten(*in_tensor_ddy); auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); @@ -705,7 +705,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_ddx); auto dout = EigenVector::Flatten(*in_tensor_dout); auto y = EigenVector::Flatten(*in_tensor_y); auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); @@ -717,7 +717,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + ctx.template Alloc(out_tensor_d_ddy); auto dout = EigenVector::Flatten(*in_tensor_dout); auto x = 
EigenVector::Flatten(*in_tensor_x); auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); @@ -732,7 +732,7 @@ struct DotTripleGradFunctiondata(); if (out_tensor_d_x) { - auto* data_d_x = out_tensor_d_x->mutable_data(ctx.GetPlace()); + auto* data_d_x = ctx.template Alloc(out_tensor_d_x); const auto* data_ddy = in_tensor_ddy->data(); const DDim& dim = out_tensor_d_x->dims(); @@ -747,7 +747,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_y = ctx.template Alloc(out_tensor_d_y); const auto* data_ddx = in_tensor_ddx->data(); const DDim& dim = out_tensor_d_y->dims(); @@ -762,7 +762,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_dout = ctx.template Alloc(out_tensor_d_dout); auto* data_ddx = in_tensor_ddx->data(); auto* data_ddy = in_tensor_ddy->data(); auto* data_d_dx = in_tensor_d_dx->data(); @@ -790,7 +790,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddx = ctx.template Alloc(out_tensor_d_ddx); auto* data_dout = in_tensor_dout->data(); auto* data_d_dy = in_tensor_d_dy->data(); auto* data_y = in_tensor_y->data(); @@ -809,7 +809,7 @@ struct DotTripleGradFunctionmutable_data(ctx.GetPlace()); + auto* data_d_ddy = ctx.template Alloc(out_tensor_d_ddy); auto* data_dout = in_tensor_dout->data(); auto* data_d_dx = in_tensor_d_dx->data(); auto* data_x = in_tensor_x->data(); @@ -838,10 +838,10 @@ void DotGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* dy) { if (dx) { - dx->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dx); } if (dy) { - dy->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dy); } DotGradFunction()(dev_ctx, &x, &y, &dout, dx, dy); } @@ -857,13 +857,13 @@ void DotDoubleGradKernel(const Context& dev_ctx, DenseTensor* dy, DenseTensor* ddout) { if (dx) { - dx->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dx); } if (dy) { - dy->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(dy); } if (ddout) { - ddout->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(ddout); } DotDoubleGradFunction()( dev_ctx, &x, &y, &dout, ddx, ddy, dx, dy, ddout); @@ -885,19 +885,19 @@ void DotTripleGradKernel(const Context& dev_ctx, DenseTensor* d_ddy, DenseTensor* d_dout) { if (d_x) { - d_x->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_x); } if (d_y) { - d_y->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_y); } if (d_ddx) { - d_ddx->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_ddx); } if (d_ddy) { - d_ddy->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_ddy); } if (d_dout) { - d_dout->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(d_dout); } DotTripleGradFunction()(dev_ctx, diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h index 4fee23e175c9ec28a492a3ff2cf37ba5c1234b92..4fbe9f34e5b4d9e683db4f623fe6195f21469f8d 100644 --- a/paddle/pten/kernels/impl/full_kernel_impl.h +++ b/paddle/pten/kernels/impl/full_kernel_impl.h @@ -26,7 +26,7 @@ namespace pten { template void FullValue(const Context& dev_ctx, DenseTensor* tensor, VType val) { - tensor->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(tensor); auto t = pten::EigenVector::Flatten(*tensor); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(val)); } diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index fbcb073150cc5ecea4252d755c6f85e677bdf120..87785a2b4778a8cbb3d54dd308ca812d158eb042 100644 --- 
a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -105,7 +105,7 @@ void MatMul(const Context& dev_ctx, bool trans_b, DenseTensor* out, bool flag = false) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); auto blas = paddle::operators::math::GetBlas(dev_ctx); auto mat_dim_a = paddle::operators::math::CreateMatrixDescriptor(a.dims(), 0, trans_a); @@ -123,7 +123,7 @@ void MatMul(const Context& dev_ctx, b.data(), mat_dim_b, static_cast(1), - out->data(), + dev_ctx.template Alloc(out), static_cast(flag)); } @@ -242,8 +242,8 @@ void MatmulGradKernel(const Context& dev_ctx, // Case1 : x's or y's dim = 1 if (x_ndim == 1 && y_ndim == 1) { - if (dx) dx->mutable_data(dev_ctx.GetPlace()); - if (dy) dy->mutable_data(dev_ctx.GetPlace()); + if (dx) dev_ctx.template Alloc(dx); + if (dy) dev_ctx.template Alloc(dy); if (out_grad.numel() == 1) { DotGradFunction()(dev_ctx, &x, &y, &out_grad, dx, dy); return; diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h index e59a54c703ab543c5d27db5291b0b2cb9c6ee79b..858807a1d4d6496d5e3091aa71f5b2dada03b92e 100644 --- a/paddle/pten/kernels/impl/matmul_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h @@ -118,7 +118,7 @@ void MatMulFunction(const Context& dev_ctx, N)); VLOG(3) << "MatMul's case 1"; Out->Resize({1}); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); blas.GEMM(CblasNoTrans, CblasTrans, 1, @@ -128,7 +128,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); return; } @@ -165,7 +165,7 @@ void MatMulFunction(const Context& dev_ctx, out_dims.back() = y_dims.back(); } Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims)); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); if (trans_y) { const int M = Y.numel() / N; VLOG(3) << "MatMul's case 2"; @@ -176,7 +176,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { const int M = y_dims[y_ndim - 1]; const int batch_size = Y.numel() / (M * N); @@ -189,7 +189,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 4"; blas.BatchedGEMM(CblasTrans, @@ -201,7 +201,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), batch_size, M * N, 0); @@ -243,7 +243,7 @@ void MatMulFunction(const Context& dev_ctx, std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); } Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims)); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); if (trans_x) { const int M = x_dims[x_ndim - 1]; @@ -257,7 +257,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 6"; blas.BatchedGEMM(CblasTrans, @@ -269,7 +269,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), batch_size, M * N, 0); @@ -284,7 +284,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } return; } @@ -331,7 +331,7 @@ void MatMulFunction(const Context& dev_ctx, out_broadcast_dims[ndim - 1] = N; 
Out->ResizeAndAllocate(pten::framework::make_ddim(out_broadcast_dims)); - Out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(Out); const int batch_dim = ndim - 2; // broadcast message @@ -367,7 +367,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else if (x_batch_size == 1) { if (M == 1 && trans_y) { VLOG(3) << "MatMul's case 9"; @@ -378,7 +378,7 @@ void MatMulFunction(const Context& dev_ctx, y_data, x_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 10"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, @@ -390,7 +390,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), out_batch_size, 0, K * N); @@ -407,7 +407,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data()); + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 12"; blas.BatchedGEMM(CblasTrans, @@ -419,7 +419,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), out_batch_size, M * K, 0); @@ -435,7 +435,7 @@ void MatMulFunction(const Context& dev_ctx, x_data, y_data, static_cast(flag), - Out->data(), + dev_ctx.template Alloc(Out), out_batch_size, M * K, K * N); @@ -454,7 +454,7 @@ void MatMulFunction(const Context& dev_ctx, x_ptr[i] = x_data + x_index * M * K; y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = Out->data() + i * M * N; + out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); } VLOG(3) << "MatMul's case 14"; diff --git a/paddle/pten/kernels/impl/sign_kernel_impl.h b/paddle/pten/kernels/impl/sign_kernel_impl.h index 54c1464c9e0221d5cc17c0db29fd7c2ce5ebf0f1..87efacccc97c5449ae54f44925b283f0a4a20ba6 100644 --- a/paddle/pten/kernels/impl/sign_kernel_impl.h +++ b/paddle/pten/kernels/impl/sign_kernel_impl.h @@ -26,7 +26,7 @@ template void SignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); auto eigen_out = pten::EigenVector::Flatten(*out); auto eigen_x = pten::EigenVector::Flatten(x); diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc index 4b706e9e685b47af20dd23ab0855db6116623c46..a76dfb09a0ea4ad4f531c323e015c8cdccdd165d 100644 --- a/paddle/pten/kernels/reshape_kernel.cc +++ b/paddle/pten/kernels/reshape_kernel.cc @@ -32,7 +32,7 @@ void ReshapeKernel(const Context& dev_ctx, return; } out->set_meta(out_meta); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.Alloc(out); pten::Copy(dev_ctx, x, false, out); out->Resize(out_meta.dims); out->ResetLoD(x.lod()); diff --git a/paddle/pten/kernels/xpu/copy_kernel.cc b/paddle/pten/kernels/xpu/copy_kernel.cc index 56b79061f75f680cfc82b54d18733769b50b07b3..56ad19f0cc3dde250d10614165b52c447fa2b744 100644 --- a/paddle/pten/kernels/xpu/copy_kernel.cc +++ b/paddle/pten/kernels/xpu/copy_kernel.cc @@ -30,7 +30,7 @@ void Copy(const Context& dev_ctx, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); - auto* dst_ptr = dst->mutable_data(dev_ctx.GetPlace()); + auto* dst_ptr = dev_ctx.Alloc(dst); const auto& src_place = src.place(); const auto& dst_place = dst->place(); diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index 
c9d376b81a630c86a976f991d4edf693312f72ba..33d27ca5b1c9a91c577252e7f1596f4c163692d3 100644
--- a/paddle/pten/tests/kernels/test_cast_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/kernels/cast_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/dense_tensor.h"
@@ -48,6 +49,11 @@ TEST(DEV_API, cast) {
   }
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
+
   pten::DataType out_dtype = pten::DataType::FLOAT64;
   // 2. test API
   auto out = pten::Cast(dev_ctx, dense_x, out_dtype);
diff --git a/paddle/pten/tests/kernels/test_concat_dev_api.cc b/paddle/pten/tests/kernels/test_concat_dev_api.cc
index 6f9ea1b0d990ae9e4d789bc4c37fb104c730fe82..eb546e992e953840fc43daf0e40178f06abe108b 100644
--- a/paddle/pten/tests/kernels/test_concat_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_concat_dev_api.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/concat_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -56,6 +57,10 @@ TEST(DEV_API, concat) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Concat(dev_ctx, inputs, 0);
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc
index 6714b57105bd24ca292e184d1ff90cf7d82e1b92..e43769dfb2b01116796e3d289f9d0d2dd9f772b0 100644
--- a/paddle/pten/tests/kernels/test_conj_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/kernels/complex_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -44,6 +45,10 @@ TEST(DEV_API, conj) {
   }
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   // 2. test API
   auto out = pten::Conj(dev_ctx, dense_x);
diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc
index 01dfa925d6c5a1a59f0ae28cd1b28127221ec950..29f68513fa77e71993ba37d5ca94ccda67b1dcd5 100644
--- a/paddle/pten/tests/kernels/test_copy_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/kernels/copy_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
@@ -57,6 +58,10 @@ TEST(DEV_API, copy) {
   std::cout << typeid(a).name() << std::endl;
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get());
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc
index 17416d33473d07b227cad38f74bce9c47dd8d520..8b37c41d0b55b2b6bd6375301563fcb46790f783 100644
--- a/paddle/pten/tests/kernels/test_creation_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/empty_kernel.h"
 #include "paddle/pten/kernels/full_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -32,6 +33,10 @@ using DDim = pten::framework::DDim;
 TEST(DEV_API, empty) {
   // 1. create input
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   // 2. test API
   auto out = pten::Empty(dev_ctx, {3, 2}, pten::DataType::INT32);
@@ -58,6 +63,10 @@ TEST(DEV_API, empty_like) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::EmptyLike(dev_ctx, dense_x);
   // 3. check result
@@ -74,6 +83,10 @@ TEST(DEV_API, full) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Full(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32);
   // 3. check result
@@ -103,6 +116,10 @@ TEST(DEV_API, full_like) {
   float val = 1.0;
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   // 2. test API
   auto out = pten::FullLike(dev_ctx, dense_x, val);
diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc
index 27fecd3fcd9e38cdf48d96cc83f5d26705adc906..c1f7d6aaba39b91e0820c88850a223c6221f60d4 100644
--- a/paddle/pten/tests/kernels/test_dot_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/kernels/dot_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -57,6 +58,10 @@ TEST(DEV_API, dot) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Dot(dev_ctx, dense_x, dense_y);
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
index b3948843ee86c56987233bd5238edc3611a0fe9e..9d4c86f02679db03da14e26e52d65e37fdad21ab 100644
--- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/kernels/math_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -59,6 +60,10 @@ TEST(DEV_API, add) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto dense_out = pten::Add(dev_ctx, dense_x, dense_y);
   // 3. check result
@@ -107,6 +112,10 @@ TEST(DEV_API, subtract) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto dense_out = pten::Subtract(dev_ctx, dense_x, dense_y);
   // 3. check result
@@ -155,6 +164,10 @@ TEST(DEV_API, divide) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto dense_out = pten::Divide(dev_ctx, dense_x, dense_y);
   // 3. check result
@@ -203,6 +216,10 @@ TEST(DEV_API, multiply) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto dense_out = pten::Multiply(dev_ctx, dense_x, dense_y);
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
index fc463d1ff1e1cdaeee1641a2c88f621b0a12c4de..2ebf10916becc257a4352c454248a1e659a26309 100644
--- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/kernels/flatten_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -55,6 +56,10 @@ TEST(DEV_API, flatten) {
   }
   int start_axis = 1, stop_axis = 2;
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   // 2. test API
   auto out = pten::Flatten(dev_ctx, dense_x, start_axis, stop_axis);
diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc
index 40419ecb3ad936d78eef9bfd7b0c6d0aff93d64c..87c91b10081b985d85d079f144c257185329715a 100644
--- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/matmul_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -54,6 +55,10 @@ TEST(DEV_API, dot) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = Matmul(dev_ctx, dense_x, dense_y, false, false);
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc
index 786492d3a1b1bdf462fa82f76d919cbc4d47a623..3abf54d26af31ccd315eb63b231202fba83ea1fa 100644
--- a/paddle/pten/tests/kernels/test_mean_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/math_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -47,6 +48,10 @@ TEST(DEV_API, mean) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Mean(dev_ctx, dense_x, dims, false);
   // 3. check result
diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc
index ac2bb60cf9fe6b97e7d8dbb8e9204aa2c08335f9..fe9b09c25557c30b8995f3d8ffd8be03e0fe2cdb 100644
--- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/reshape_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -47,6 +48,10 @@ TEST(DEV_API, reshape) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Reshape(dev_ctx, dense_x, shape);
   // 3. check result
   std::vector expect_shape = {12, 3};
diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc
index abb592cde3ff4276f9b0dbce3afb9d912a2e0f9f..80f12950094b794485e577e1e6920e00b136b55a 100644
--- a/paddle/pten/tests/kernels/test_scale_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/scale_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -47,6 +48,10 @@ TEST(DEV_API, scale) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale);
@@ -85,6 +90,10 @@ TEST(DEV_API, scale_host) {
   // 2. test API
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale);
diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc
index 595f0b96920ae24b2daadeca8e749d0232627720..9b48d8908ff2306fa9466874d0edcde8864e9379 100644
--- a/paddle/pten/tests/kernels/test_sum_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc
@@ -17,10 +17,10 @@ limitations under the License. */
 #include "paddle/pten/kernels/math_kernel.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
-
 namespace pten {
 namespace tests {
@@ -46,6 +46,10 @@ TEST(DEV_API, sum) {
   std::vector axis = {0, 1};
   pten::CPUContext dev_ctx;
+  dev_ctx.SetDeviceAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   // 2. test API
   auto out = pten::Sum(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false);
diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
index 48f95472c7ec71c4ed71bd80f2d5430b04636813..1e856a0fe900fca423333f2d859af40db49e8f24 100644
--- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
+++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import os
+os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true"
 import sys
 import unittest
 import paddle