diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index e2a62273d03282809a4e795d98aaa2a0a9250536..3b06722ddfbe013fbda9eb9d046f2692808ebb5e 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -50,7 +50,7 @@ math_library(tree2col DEPS math_function)
 cc_test(
   selected_rows_functor_test
   SRCS selected_rows_functor_test.cc
-  DEPS selected_rows_functor)
+  DEPS allocator selected_rows_functor)
 cc_test(
   im2col_test
   SRCS im2col_test.cc
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc
index 49c6942531defd914a4c827ba74866f8edb332a9..a2c88c723fefa6d95b616a49f337a94e293016f4 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
@@ -15,11 +15,15 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/selected_rows_functor.h"
 
 #include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 TEST(selected_rows_functor, cpu_add) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -109,6 +113,9 @@ TEST(selected_rows_functor, cpu_add) {
 TEST(selected_rows_functor, cpu_add_to) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -198,6 +205,9 @@ TEST(selected_rows_functor, cpu_add_to) {
 TEST(selected_rows_functor, cpu_merge_average_float) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -233,6 +243,9 @@ TEST(selected_rows_functor, cpu_merge_average_float) {
 TEST(selected_rows_functor, cpu_merge_add_float) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -269,6 +282,9 @@ TEST(selected_rows_functor, cpu_merge_add_float) {
 TEST(selected_rows_functor, cpu_merge_add_int) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, int> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -305,6 +321,9 @@ TEST(selected_rows_functor, cpu_merge_add_int) {
 TEST(selected_rows_functor, cpu_merge_add_multi) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> set_const;
 
   int64_t height = 10;
@@ -354,6 +373,9 @@ TEST(selected_rows_functor, cpu_merge_add_multi) {
 TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> set_const;
 
   int64_t height = 10;
@@ -409,6 +431,9 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) {
 TEST(selected_rows_functor, cpu_sum_to) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index a09f438c505e80211aedc951921a4268c22d1b47..539bbfb87d0aa0623c1dca32d0144b3e41b25f4a 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -163,7 +163,7 @@ std::unique_ptr<DeviceContext> CreateDeviceContext(
     if (!disable_setting_default_stream_for_allocator) {
       instance.SetDefaultStream(CUDAPlace(p.GetDeviceId()), cuda_ctx->stream());
     }
-    dev_ctx->SetAllocator(instance.GetAllocator(p).get());
+    dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
     dev_ctx->SetPinnedAllocator(
         instance.GetAllocator(paddle::platform::CUDAPinnedPlace()).get());
diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc
index d46f9250eeb4c90a0dbc2352e34face68ad06513..a18e695cce4d8d6d0cbe1b5620d2f096a53a0b13 100644
--- a/paddle/phi/core/device_context.cc
+++ b/paddle/phi/core/device_context.cc
@@ -148,7 +148,7 @@ struct DeviceContext::Impl {
     if (tensor->initialized() && tensor->place() != place) {
       ClearHolder(tensor);
     }
-    auto* allocator = tensor->numel() == 0
+    auto* allocator = tensor->numel() == 0 && requested_size == 0
                           ? zero_allocator_
                           : (pinned ? pinned_allocator_ : device_allocator_);
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc
index 96e02f4c42046a72828d251465545654c3553b4a..1075cb9f777c38573498a1134124ab6dbcd32fec 100644
--- a/paddle/phi/kernels/cpu/concat_kernel.cc
+++ b/paddle/phi/kernels/cpu/concat_kernel.cc
@@ -44,7 +44,7 @@ void ConcatKernel(const Context& dev_ctx,
   phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis);
   out->Resize(out_dims);
-  out->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(out);
 
   // If axis is 0, the lod of the output is not the same as inputs.
   if (axis == 0 && x[0]->lod().size() > 0) {
diff --git a/paddle/phi/kernels/cpu/elementwise_grad.h b/paddle/phi/kernels/cpu/elementwise_grad.h
index 92587566eb87591c5c35572fcba8a39af8445f5a..05c02f167b6a2db6674152b7a1a110e19ca4e247 100644
--- a/paddle/phi/kernels/cpu/elementwise_grad.h
+++ b/paddle/phi/kernels/cpu/elementwise_grad.h
@@ -90,13 +90,11 @@ ElementwiseAddGrad(const CPUContext& ctx,
                    int axis = -1) {
   auto blas = phi::funcs::GetBlas<CPUContext, T>(ctx);
   if (dx) {
-    blas.VCOPY(
-        dout.numel(), dout.data<T>(), dx->mutable_data<T>(ctx.GetPlace()));
+    blas.VCOPY(dout.numel(), dout.data<T>(), ctx.template Alloc<T>(dx));
   }
 
   if (dy) {
-    blas.VCOPY(
-        dout.numel(), dout.data<T>(), dy->mutable_data<T>(ctx.GetPlace()));
+    blas.VCOPY(dout.numel(), dout.data<T>(), ctx.template Alloc<T>(dy));
   }
 }
diff --git a/paddle/phi/kernels/cpu/histogram_kernel.cc b/paddle/phi/kernels/cpu/histogram_kernel.cc
index d9c41508efde08f80454a8586163bbc06dbfc984..4c04566b8b0b0789463b4bdd6247b2914559192a 100644
--- a/paddle/phi/kernels/cpu/histogram_kernel.cc
+++ b/paddle/phi/kernels/cpu/histogram_kernel.cc
@@ -34,7 +34,7 @@ void HistogramKernel(const Context& dev_ctx,
   const T* input_data = input.data<T>();
   auto input_numel = input.numel();
 
-  int64_t* out_data = output->mutable_data<int64_t>(dev_ctx.GetPlace());
+  int64_t* out_data = dev_ctx.template Alloc<int64_t>(output);
   phi::funcs::SetConstant<Context, int64_t>()(
       dev_ctx, output, static_cast<int64_t>(0));
diff --git a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc
index bbb08f061677690e9c2dd8dbee598a7bbbf3982e..f615fb2e0bc3fdeb7b20b206c60fc2c0d7abbdd1 100644
--- a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc
@@ -27,7 +27,8 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
                             DenseTensor* x_grad) {
   auto* mask_data = mask.data<bool>();
   auto* input_data = out_grad.data<T>();
-  auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
+
+  auto* out_data = dev_ctx.template Alloc<T>(x_grad);
   int mask_size = mask.numel();
   int index = 0;
diff --git a/paddle/phi/kernels/cpu/masked_select_kernel.cc b/paddle/phi/kernels/cpu/masked_select_kernel.cc
index f377658d507f6086101e1cdb0f0ab1891536e771..33311c26cfeb640863142239d8b8a0afcd2b0f3f 100644
--- a/paddle/phi/kernels/cpu/masked_select_kernel.cc
+++ b/paddle/phi/kernels/cpu/masked_select_kernel.cc
@@ -48,7 +48,8 @@ void MaskedSelectKernel(const Context& dev_ctx,
 
   DDim out_dim{out_size};
   out->Resize(out_dim);
-  auto out_data = out->mutable_data<T>(phi::CPUPlace());
+
+  auto out_data = dev_ctx.template HostAlloc<T>(out);
 
   int index = 0;
   for (int i = 0; i < mask_size; i++) {
diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
index ca57c223beb4be54805e6435a2dccea4caceb50d..969c5b9fe330643e157be7cd90b5f5a573b5615c 100644
--- a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc
@@ -58,7 +58,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
 
   if (value_grad) {
     value_grad->Resize(index.dims());
-    value_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(value_grad);
     if (index_type == DataType::INT32) {
       paddle::operators::cpu_gather_kernel<T, int32_t>(
           out_grad, axis, index, *value_grad, dev_ctx);
diff --git a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
index 9e6a0e441223f4df80de49aa7f4658e87295fac3..3dcd3c9eb49fb7fbb1908ec8beaf6bf5b01f8952 100644
--- a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
@@ -114,8 +114,9 @@ void TemporalShiftGradKernel(const Context& dev_ctx,
       (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                         : phi::make_ddim({nt, h, w, c}));
   const T* output_grad_data = output_grad->data<T>();
-  T* input_grad_data =
-      input_grad->mutable_data<T>(in_grad_dims, dev_ctx.GetPlace());
+  input_grad->Resize(in_grad_dims);
+
+  T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
 
   if (data_layout == DataLayout::kNCHW) {
     TemporalShiftBwNCHW<T>(
diff --git a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
index 3d10520ec84f5a5c934d019a987dc25c4ab6a605..3edd3aa301f9855a4e2667cd4a4092fe271f4dba 100644
--- a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
+++ b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
@@ -114,7 +114,8 @@ void TemporalShiftKernel(const Context& dev_ctx,
       (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                         : phi::make_ddim({nt, h, w, c}));
   const T* input_data = input->data<T>();
-  T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace());
+  output->Resize(out_dims);
+  T* output_data = dev_ctx.template Alloc<T>(output);
 
   if (data_layout == DataLayout::kNCHW) {
     TemporalShiftFwNCHW<T>(
diff --git a/paddle/phi/kernels/cpu/yolo_box_kernel.cc b/paddle/phi/kernels/cpu/yolo_box_kernel.cc
index 6b882ad28951297810e210321e03ab3cb837ea0c..0c04c78214a352c21db90163f70b4ba43d375a4a 100644
--- a/paddle/phi/kernels/cpu/yolo_box_kernel.cc
+++ b/paddle/phi/kernels/cpu/yolo_box_kernel.cc
@@ -51,16 +51,19 @@ void YoloBoxKernel(const Context& dev_ctx,
   const int an_stride = (class_num + 5) * stride;
 
   DenseTensor anchors_;
-  auto anchors_data =
-      anchors_.mutable_data<int>({an_num * 2}, dev_ctx.GetPlace());
+  anchors_.Resize({an_num * 2});
+  auto anchors_data = dev_ctx.template Alloc<int>(&anchors_);
   std::copy(anchors.begin(), anchors.end(), anchors_data);
 
   const T* input_data = input->data<T>();
   const int* imgsize_data = imgsize->data<int>();
-  T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, dev_ctx.GetPlace());
+  boxes->Resize({n, box_num, 4});
+  T* boxes_data = dev_ctx.template Alloc<T>(boxes);
   memset(boxes_data, 0, boxes->numel() * sizeof(T));
-  T* scores_data =
-      scores->mutable_data<T>({n, box_num, class_num}, dev_ctx.GetPlace());
+
+  scores->Resize({n, box_num, class_num});
+  T* scores_data = dev_ctx.template Alloc<T>(scores);
+
   memset(scores_data, 0, scores->numel() * sizeof(T));
 
   T box[4];
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index e19735e6c13550e045af58a359ff5ba339a53fb3..d2c30c8fa361146717674f4e80f86462aa4459a2 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -996,7 +996,7 @@ void ElementwiseCompute(const GPUContext &dev_ctx,
                         DenseTensor *z) {
   std::vector<const DenseTensor *> ins = {&x, &y};
   std::vector<DenseTensor *> outs = {z};
-  z->mutable_data<OutType>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<OutType>(z);
   BroadcastKernel<ElementwiseType::kBinary, T, OutType, Functor, 1>(
       dev_ctx, ins, &outs, axis, func);
 }
diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h
index 81525cb25449ee5a48ee36f8fd472e754d32a8a9..92f970aed327951e0d1833726bb1f32e8332e4a4 100644
--- a/paddle/phi/kernels/funcs/diagonal.h
+++ b/paddle/phi/kernels/funcs/diagonal.h
@@ -104,7 +104,8 @@ DenseTensor Diagonal(const DeviceContext& context,
   DenseTensor diag;
   DDim diag_dims = phi::make_ddim(ret_dims);
   auto dig_stride = phi::stride(diag_dims);
-  auto diag_data = diag.mutable_data<T>(diag_dims, context.GetPlace());
+  diag.Resize(diag_dims);
+  auto diag_data = context.template Alloc<T>(&diag);
 
   int64_t pos = std::abs(offset) * offset_stride;
   int64_t dim_size = ret_strides.size();
diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h
index 17b0a653cc8a850498b4623094602f0c82c8dbf2..ffb3ff4ae334aadaed16bb1b3511cc5f3ed818cf 100644
--- a/paddle/phi/kernels/funcs/elementwise_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_base.h
@@ -474,7 +474,7 @@ static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx,
   } else {
     auto meta = phi::DenseTensorMeta(x.dtype(), x.dims(), x.layout());
     *ddx_safe = phi::Empty(dev_ctx, std::move(meta));
-    ddx_safe->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(ddx_safe);
     SetConstant<DeviceContext, T> set_zero;
     set_zero(dev_ctx, ddx_safe, static_cast<T>(0));
   }
diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h
index 65f21e5b7f196ccab6e3d65086ca4015615b6e98..b9ffb4e3f123782d334bdd258811dfaa7b52e8cd 100644
--- a/paddle/phi/kernels/funcs/elementwise_grad_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -237,7 +237,8 @@ void CommonElementwiseBroadcastBackward(const CPUContext &ctx,
   // result.
   if (dx && dx->IsSharedBufferWith(dout)) {
     dx->clear();
-    dx->mutable_data<T>(x_dims, ctx.GetPlace());
+    dx->Resize(x_dims);
+    ctx.template Alloc<T>(dx);
   }
 
   VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
@@ -1680,7 +1681,8 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx,
   // result.
   if (dx && dx->IsSharedBufferWith(dout)) {
     dx->clear();
-    dx->mutable_data<T>(x_dims, ctx.GetPlace());
+    dx->Resize(x_dims);
+    ctx.template Alloc<T>(dx);
   }
 
   VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
diff --git a/paddle/phi/kernels/funcs/fc_functor.cc b/paddle/phi/kernels/funcs/fc_functor.cc
index f428746bc524d704d0e889cbc5e879f89fbbad77..31212a687fa73aab1ab8c175e386d1732df6de82 100644
--- a/paddle/phi/kernels/funcs/fc_functor.cc
+++ b/paddle/phi/kernels/funcs/fc_functor.cc
@@ -39,8 +39,11 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
     const int NN = N + 4;
     const int KK = K + 4;
     phi::DenseTensor X1;
-    T* X1_data = X1.mutable_data<T>({M * KK}, paddle::platform::CPUPlace());
-    Y1_data = Y1.mutable_data<T>({M * (N + 4)}, paddle::platform::CPUPlace());
+    X1.Resize({M * KK});
+    T* X1_data = context.template HostAlloc<T>(&X1);
+
+    Y1.Resize({M * (N + 4)});
+    Y1_data = context.template HostAlloc<T>(&Y1);
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu
index db4cdc57e2f04d54e4ac3122910be492fe511c33..a0e59f8f3fe23c4c43cc015152adc5a721ba2b60 100644
--- a/paddle/phi/kernels/funcs/math_function.cu
+++ b/paddle/phi/kernels/funcs/math_function.cu
@@ -319,7 +319,9 @@ void ColwiseSum<phi::GPUContext, double>::operator()(
                         size,
                         vector->numel()));
   phi::DenseTensor one;
-  one.mutable_data<double>({in_dims[0]}, context.GetPlace());
+  one.Resize({in_dims[0]});
+  context.template Alloc<double>(&one);
+
   SetConstant<phi::GPUContext, double> set;
   set(context, &one, static_cast<double>(1.0));
   phi::funcs::GetBlas<phi::GPUContext, double>(context).GEMV(
@@ -355,7 +357,9 @@ void RowwiseSum<phi::GPUContext, double>::operator()(
                         in_dims[0],
                         vector->numel()));
   phi::DenseTensor one;
-  one.mutable_data<double>({size}, context.GetPlace());
+  one.Resize({size});
+  context.template Alloc<double>(&one);
+
   SetConstant<phi::GPUContext, double> set;
   set(context, &one, static_cast<double>(1.0));
   phi::funcs::GetBlas<phi::GPUContext, double>(context).GEMV(
diff --git a/paddle/phi/kernels/funcs/math_function_impl.h b/paddle/phi/kernels/funcs/math_function_impl.h
index b59a249bbbf04685f3f3c495bcf567fe34ab8c56..2011523a0153d7bfd84ffb2e0fa4d719dfabe614 100644
--- a/paddle/phi/kernels/funcs/math_function_impl.h
+++ b/paddle/phi/kernels/funcs/math_function_impl.h
@@ -117,7 +117,7 @@ class ColwiseSum<phi::CPUContext, T> {
                         size,
                         out->numel()));
 
-    T* out_buf = out->mutable_data<T>(out->place());
+    T* out_buf = context.template Alloc<T>(out);
    const T* in_buf = input.data<T>();
 
     for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
@@ -185,7 +185,7 @@ class RowwiseMean<phi::CPUContext, T> {
                         height,
                         out->numel()));
     auto inv_size = 1.0 / size;
-    T* out_buf = out->mutable_data<T>(out->place());
+    T* out_buf = context.template Alloc<T>(out);
     const T* in_buf = input.data<T>();
 
     for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
@@ -251,7 +251,7 @@ class RowwiseSum<phi::CPUContext, T> {
                         height,
                         out->numel()));
 
-    T* out_buf = out->mutable_data<T>(out->place());
+    T* out_buf = context.template Alloc<T>(out);
     const T* in_buf = input.data<T>();
 
     for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h
index 4fb1bc13ae7f82ad5d77d12981f76a773804735f..c5ddce68e7e2d64d57c325459dd9040c811d91d3 100644
--- a/paddle/phi/kernels/funcs/select_impl.cu.h
+++ b/paddle/phi/kernels/funcs/select_impl.cu.h
@@ -451,7 +451,7 @@ void SelectKernel(const KPDevice &dev_ctx,
     out_dim.push_back(static_cast<int64_t>(rank));
     out->Resize(phi::make_ddim(out_dim));
   }
-  auto out_data = out->mutable_data<OutT>(cuda_place);
+  auto out_data = dev_ctx.template Alloc<OutT>(out);
   // 3.2 get true data's index according to cond_data and cumsum_data
   if (total_true_num <= 0) return;
   SelectKernel
diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cc b/paddle/phi/kernels/funcs/selected_rows_functor.cc
index de362d45a8ba745ad89145b910e74311fe27fe14..fb087660612ec5e03397aaad2a7aaa360b6b6b33 100644
--- a/paddle/phi/kernels/funcs/selected_rows_functor.cc
+++ b/paddle/phi/kernels/funcs/selected_rows_functor.cc
@@ -542,11 +542,10 @@ struct MergeAddImpl {
     }
 
     out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim(
-            {static_cast<int64_t>(merged_row_set.size()), input_width}),
-        context.GetPlace());
-    auto* out_data = out.mutable_value()->data<T>();
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(phi::make_ddim(
+        {static_cast<int64_t>(merged_row_set.size()), input_width}));
+    auto* out_data = context.template Alloc<T>(out_tensor);
 
     if (merged_row_set.size() == row_num && !sorted_result) {
       // no duplicated ids, just concat the result together
@@ -659,9 +658,10 @@ struct MergeAdd {
     out.set_rows(merge_rows);
     out.set_height(input.height());
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(
+        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}));
+    context.template Alloc<T>(out_tensor);
 
     std::unordered_map<int64_t, size_t> rows_to_id;
     for (size_t i = 0; i < merge_rows.size(); ++i) {
@@ -748,12 +748,13 @@ struct MergeAdd {
     out.set_rows(merge_rows);
     out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim(
-            {static_cast<int64_t>(merged_row_set.size()), input_width}),
-        context.GetPlace());
-    float* y_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(phi::make_ddim(
+        {static_cast<int64_t>(merged_row_set.size()), input_width}));
+    context.template Alloc<T>(out_tensor);
+
+    float* y_data = reinterpret_cast<float*>(out_tensor->data<T>());
 
     std::unordered_map<int64_t, size_t> rows_to_id;
     for (size_t i = 0; i < merge_rows.size(); ++i) {
@@ -856,11 +857,11 @@ struct MergeAverage {
     }
 
     out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim(
-            {static_cast<int64_t>(merged_row_set.size()), input_width}),
-        context.GetPlace());
-    auto* out_data = out.mutable_value()->data<T>();
+
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(phi::make_ddim(
+        {static_cast<int64_t>(merged_row_set.size()), input_width}));
+    auto* out_data = context.template Alloc<T>(out_tensor);
 
     std::vector<int64_t> merge_rows(merged_row_set.begin(),
                                     merged_row_set.end());
diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cu b/paddle/phi/kernels/funcs/selected_rows_functor.cu
index e08fea2b353176e66ae873f2e79c2fcaf345c431..8f409466e19b192ad7f8d8624beeed701b9bf881 100644
--- a/paddle/phi/kernels/funcs/selected_rows_functor.cu
+++ b/paddle/phi/kernels/funcs/selected_rows_functor.cu
@@ -392,9 +392,10 @@ struct MergeAddImpl {
     out.set_rows(merge_rows);
     out.set_height(input.height());
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(
+        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}));
+    context.template Alloc<T>(out_tensor);
 
     phi::funcs::SetConstant<phi::GPUContext, T> constant_functor;
     constant_functor(context, out.mutable_value(), static_cast<T>(0));
@@ -462,9 +463,11 @@ struct MergeAddImpl {
     out.set_rows(merge_rows);
     out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
+
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(
+        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}));
+    context.template Alloc<T>(out_tensor);
 
     phi::funcs::SetConstant<phi::GPUContext, T> constant_functor;
     constant_functor(context, out.mutable_value(), static_cast<T>(0));
diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h
index 6c48e05c76cf79b0eb96ef7af09a1e509548fd68..f04c7a8da8be1e638aeaa73f8a795e2ea0ecae19 100644
--- a/paddle/phi/kernels/funcs/top_k_function_cuda.h
+++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h
@@ -941,8 +941,7 @@ bool SortTopk(const phi::GPUContext& ctx,
   const std::vector<int64_t> dims = {num_rows, num_cols};
   auto dim = phi::make_ddim(dims);
   input_indices.Resize(dim);
-  // input_indices.Resize(num_rows*num_cols);
-  input_indices.mutable_data<int64_t>(ctx.GetPlace());
+  ctx.template Alloc<int64_t>(&input_indices);
   size_t temp_storage_bytes = -1;
 
   auto ComputeBlockSize = [](int col) {
@@ -984,7 +983,7 @@ bool SortTopk(const phi::GPUContext& ctx,
 
   const T* input = input_tensor->data<T>();
   T* values = out_tensor->data<T>();
-  int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
+  int64_t* indices = ctx.template Alloc<int64_t>(indices_tensor);
 
   if (k == num_cols) {
     // Doing a full sort.
@@ -993,8 +992,8 @@ bool SortTopk(const phi::GPUContext& ctx,
   } else {
     temp_values.Resize(dim);
     temp_indices.Resize(dim);
-    sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
-    sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
+    sorted_values_ptr = ctx.template Alloc<T>(&temp_values);
+    sorted_indices_ptr = ctx.template Alloc<int64_t>(&temp_indices);
   }
 
   // Get temp storage buffer size, maybe can allocate a fixed buffer to save
@@ -1067,7 +1066,7 @@ bool SortTopk(const phi::GPUContext& ctx,
 #endif
   }
   Tensor temp_storage;
-  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
+  ctx.template Alloc<uint8_t>(&temp_storage, temp_storage_bytes);
 
   if (largest) {
     auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu
index 0666c60a8d0c12ef041a3e3fb973baad5573e38b..80ff71b2158241370cde44dc581e7b3f388877fa 100644
--- a/paddle/phi/kernels/gpu/concat_kernel.cu
+++ b/paddle/phi/kernels/gpu/concat_kernel.cu
@@ -43,7 +43,7 @@ void ConcatKernel(const Context& dev_ctx,
   phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis);
   out->Resize(out_dims);
-  out->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(out);
 
   // If axis is 0, the lod of the output is not the same as inputs.
   if (axis == 0 && x[0]->lod().size() > 0) {
diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h
index 9ed881350415061d73dbecde3d143af3dde6af0a..879056d67a7dad49d83ee37772c6230b6a165b24 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv.h
+++ b/paddle/phi/kernels/gpu/depthwise_conv.h
@@ -1231,7 +1231,7 @@ class DepthwiseConvFunctor {
     const T* input_data = input.data<T>();
     const T* filter_data = filter.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* output_data = context.template Alloc<T>(output);
 
     phi::DenseTensor filter_hwc;
     if (data_layout == DataLayout::kNHWC) {
@@ -1240,7 +1240,7 @@ class DepthwiseConvFunctor {
                                    filter.dims()[0],
                                    filter.dims()[1]});
       filter_hwc.Resize(filter_hwc_dims);
-      filter_hwc.mutable_data<T>(context.GetPlace());
+      context.template Alloc<T>(&filter_hwc);
       std::vector<int> perm_axis({2, 3, 0, 1});
       phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
       trans(context, filter, &filter_hwc, perm_axis);
@@ -1409,7 +1409,7 @@ class DepthwiseConvInputGradFunctor {
     const T* input_data = input.data<T>();
     const T* filter_data = filter.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
 
     phi::DenseTensor filter_hwc;
     if (data_layout == DataLayout::kNHWC) {
@@ -1418,7 +1418,7 @@ class DepthwiseConvInputGradFunctor {
                                    filter.dims()[0],
                                    filter.dims()[1]});
       filter_hwc.Resize(filter_hwc_dims);
-      filter_hwc.mutable_data<T>(context.GetPlace());
+      context.template Alloc<T>(&filter_hwc);
       std::vector<int> perm_axis({2, 3, 0, 1});
       phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
       trans(context, filter, &filter_hwc, perm_axis);
@@ -1584,7 +1584,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
     const T* input_data = input.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
+    T* filter_grad_data = context.template Alloc<T>(filter_grad);
 
     int block_size = 512;
     int blocks;
@@ -1654,7 +1654,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
                                filter_grad->dims()[0],                        \
                                filter_grad->dims()[1]});                      \
       filter_grad_hwc.Resize(filter_grad_hwc_dims);                           \
-      filter_grad_hwc.mutable_data<T>(context.GetPlace());                    \
+      context.template Alloc<T>(&filter_grad_hwc);                            \
       phi::funcs::SetConstant<phi::GPUContext, T> set_zero;                   \
       set_zero(context, &filter_grad_hwc, static_cast<T>(0));                \
       filter_grad_data = filter_grad_hwc.data<T>();                          \
diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
index 2e815b3e455d5724c5e82df6ff57e8dae8b2741f..5bb0a4946f17bf30692d6d1cf76d09d10602127a 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
@@ -75,7 +75,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
   phi::funcs::SetConstant<Context, T> set_zero;
 
   if (input_grad) {
-    input_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(input_grad);
     set_zero(dev_ctx, input_grad, static_cast<T>(0));
 
     if (fuse_relu) {
@@ -106,7 +106,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
   }
 
   if (filter_grad) {
-    filter_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(filter_grad);
     set_zero(dev_ctx, filter_grad, static_cast<T>(0));
     if (fuse_relu) {
       paddle::operators::math::DepthwiseConvFilterGradFunctor
diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
index 8617a42e4e544a4d7c4553b7366f0bb57dbfb1f0..1cb6301dc99acc26fcda2e709210e9ef613b4d13 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
@@ -32,7 +32,7 @@ void DepthwiseConvKernel(const Context& dev_ctx,
                          const std::string& data_format,
                          DenseTensor* out) {
   DenseTensor* output = out;
-  output->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(output);
 
   const std::vector<int> strides = strides_t;
   std::vector<int> dilations = dilations_t;
diff --git a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu
index 4aa59cded8f379815b6eaed32b2e6de85a48bdc3..cdb8d0bd277622a4e116e055216daea327125917 100644
--- a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu
@@ -29,7 +29,7 @@ void DropoutGradRawKernel(const Context& dev_ctx,
                           const std::string& mode,
                           DenseTensor* x_grad) {
   bool upscale_in_train = (mode == "upscale_in_train");
-  x_grad->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(x_grad);
   paddle::operators::DropoutGradGPUKernelDriver<T>(dev_ctx,
                                                    is_test,
                                                    p.to<float>(),
diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h
index e8f01be89737045efdef5c6cbce3a57e9f9acee1..84047f14739b556342dc33a6446a9eaa52240111 100644
--- a/paddle/phi/kernels/gpu/elementwise_grad.h
+++ b/paddle/phi/kernels/gpu/elementwise_grad.h
@@ -153,7 +153,7 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx,
 
   // dx
   if (dx != nullptr) {
-    auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    auto *dx_data = ctx.template Alloc<T>(dx);
     if (dx->dims() == dout.dims()) {
       if (dx_data != dout_data) {
         phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
@@ -163,7 +163,8 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx,
       // the result of dy wrong.
       if (dx->IsSharedBufferWith(dout)) {
         dx->clear();
-        dx->mutable_data<T>(x.dims(), ctx.GetPlace());
+        dx->Resize(x.dims());
+        ctx.template Alloc<T>(dx);
       }
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(x.dims(), out.dims(), axis);
@@ -173,7 +174,7 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx,
   }
   // dy
   if (dy != nullptr) {
-    auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
+    auto *dy_data = ctx.template Alloc<T>(dy);
     if (dy->dims() == dout.dims()) {
       if (dy_data != dout_data) {
         phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
@@ -217,12 +218,11 @@ void ElementwiseAddGrad(const GPUContext &ctx,
                         PREDEFINED_BLOCK_SIZE,
                         1);
     SimpleElemwiseAddGradCUDAKernel<T>
-        <<<grid_size, block_size, 0, ctx.stream()>>>(
-            dout.data<T>(),
-            size,
-            vec_size,
-            dx->mutable_data<T>(ctx.GetPlace()),
-            dy->mutable_data<T>(ctx.GetPlace()));
+        <<<grid_size, block_size, 0, ctx.stream()>>>(dout.data<T>(),
+                                                     size,
+                                                     vec_size,
+                                                     ctx.template Alloc<T>(dx),
+                                                     ctx.template Alloc<T>(dy));
   } else {
     VLOG(4) << "Special case when dy_data is the same as dout_data, "
               "and dx_data is the same as dout_data, do not need "
@@ -264,7 +264,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
   auto *dout_data = dout.data<T>();
   // dx
   if (dx != nullptr) {
-    auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    auto *dx_data = ctx.template Alloc<T>(dx);
     if (dx->dims() == dout.dims()) {
       if (dx_data != dout_data) {
         phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
@@ -274,7 +275,8 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
       // the result of dy wrong.
       if (dx->IsSharedBufferWith(dout)) {
         dx->clear();
-        dx->mutable_data<T>(x.dims(), ctx.GetPlace());
+        dx->Resize(x.dims());
+        ctx.template Alloc<T>(dx);
       }
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(x.dims(), out.dims(), axis);
@@ -284,7 +285,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
   }
   // dy
   if (dy != nullptr) {
-    auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
+    auto *dy_data = ctx.template Alloc<T>(dy);
     if (dy->dims() == dout.dims()) {
       if (dy_data != dout_data) {
         dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
@@ -293,10 +294,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
           dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
       SimpleElemwiseSubGradCUDAKernel<T>
           <<<grid_size, block_size, 0, ctx.stream()>>>(
-              dout.data<T>(),
-              size,
-              nullptr,
-              dy->mutable_data<T>(ctx.GetPlace()));
+              dout.data<T>(), size, nullptr, ctx.template Alloc<T>(dy));
     }
   } else {
     std::vector<int> reduce_dims =
@@ -320,11 +318,10 @@ void elementwise_sub_grad(const GPUContext &ctx,
   dim3 grid_size =
       dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
   SimpleElemwiseSubGradCUDAKernel<T>
-      <<<grid_size, block_size, 0, ctx.stream()>>>(
-          dout.data<T>(),
-          size,
-          dx->mutable_data<T>(ctx.GetPlace()),
-          dy->mutable_data<T>(ctx.GetPlace()));
+      <<<grid_size, block_size, 0, ctx.stream()>>>(dout.data<T>(),
+                                                   size,
+                                                   ctx.template Alloc<T>(dx),
+                                                   ctx.template Alloc<T>(dy));
 }
 
 /*
 ******************************
diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
index 046c210e186013692b89183d75b6c6461c084848..10a5eec5b1ecf64e4170327188f2ae38cd99e13e 100644
--- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
@@ -314,9 +314,9 @@ void GraphReindexKernel(const Context& dev_ctx,
     const auto* ph_index = hashtable_index.get_ptr();
     hashtable_index_out.ShareDataWith(*ph_index);
     int* hashtable_value_data =
-        hashtable_value_out.mutable_data<int>(dev_ctx.GetPlace());
+        dev_ctx.template Alloc<int>(&hashtable_value_out);
     int* hashtable_index_data =
-        hashtable_index_out.mutable_data<int>(dev_ctx.GetPlace());
+        dev_ctx.template Alloc<int>(&hashtable_index_out);
     BufferReindex(dev_ctx,
                   x_data,
                   src_outputs,
diff --git a/paddle/phi/kernels/gpu/histogram_kernel.cu b/paddle/phi/kernels/gpu/histogram_kernel.cu
index 02f5bbb530a6c5839f2b864bd9dd8f276a9cb7c1..47929e640d57a25c387e3d30eea2b74e69377a3a 100644
--- a/paddle/phi/kernels/gpu/histogram_kernel.cu
+++ b/paddle/phi/kernels/gpu/histogram_kernel.cu
@@ -85,7 +85,7 @@ void HistogramKernel(const Context& dev_ctx,
   const T* input_data = input.data<T>();
   const int input_numel = input.numel();
 
-  int64_t* out_data = output->mutable_data<int64_t>(dev_ctx.GetPlace());
+  int64_t* out_data = dev_ctx.template Alloc<int64_t>(output);
   phi::funcs::SetConstant<Context, int64_t>()(
       dev_ctx, output, static_cast<int64_t>(0));
 
@@ -98,8 +98,10 @@ void HistogramKernel(const Context& dev_ctx,
   auto input_x = phi::EigenVector<T>::Flatten(input);
 
   DenseTensor input_min_t, input_max_t;
-  auto* input_min_data = input_min_t.mutable_data<T>({1}, dev_ctx.GetPlace());
-  auto* input_max_data = input_max_t.mutable_data<T>({1}, dev_ctx.GetPlace());
+  input_min_t.Resize({1});
+  input_max_t.Resize({1});
+  auto* input_min_data = dev_ctx.template Alloc<T>(&input_min_t);
+  auto* input_max_data = dev_ctx.template Alloc<T>(&input_max_t);
 
   auto input_min_scala = phi::EigenScalar<T>::From(input_min_t);
   auto input_max_scala = phi::EigenScalar<T>::From(input_max_t);
diff --git a/paddle/phi/kernels/gpu/kthvalue_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_kernel.cu
index 1f6dc489690535c2ae1eba91563c7d40552bd370..b04cea2ceb55ea48163e7f345c682b5509947718 100644
--- a/paddle/phi/kernels/gpu/kthvalue_kernel.cu
+++ b/paddle/phi/kernels/gpu/kthvalue_kernel.cu
@@ -67,7 +67,7 @@ bool SortKthvalue(const phi::GPUContext& dev_ctx,
   DenseTensor temp_values, temp_indices;
   const T* input = input_tensor->data<T>();
   T* values = out_tensor->data<T>();
-  int64_t* indices = indices_tensor->mutable_data<int64_t>(dev_ctx.GetPlace());
+  int64_t* indices = dev_ctx.template Alloc<int64_t>(indices_tensor);
   temp_values.Resize(dim);
   temp_indices.Resize(dim);
   sorted_values_ptr = dev_ctx.template Alloc<T>(&temp_values);
@@ -208,13 +208,16 @@ void KthvalueKernel(const Context& dev_ctx,
     }
     trans_out_dims[in_dims.size() - 1] = 1;
     DenseTensor trans_input;
-    trans_input.mutable_data<T>(trans_dims, dev_ctx.GetPlace());
+    trans_input.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&trans_input);
     int ndims = trans.size();
    funcs::TransCompute<phi::GPUContext, T>(
         ndims, dev_ctx, x, &trans_input, trans);
     DenseTensor trans_ind, trans_out;
-    trans_ind.mutable_data<int64_t>(trans_out_dims, dev_ctx.GetPlace());
-    trans_out.mutable_data<T>(trans_out_dims, dev_ctx.GetPlace());
+    trans_ind.Resize(trans_out_dims);
+    trans_out.Resize(trans_out_dims);
+    dev_ctx.template Alloc<int64_t>(&trans_ind);
+    dev_ctx.template Alloc<T>(&trans_out);
     const int64_t input_height =
         phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
     const int64_t input_width = trans_dims[trans_dims.size() - 1];
diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
index 16c32886e235a1ab50caac657920806fec0e86fc..fcf43f9f42718d8bb035977c7e0e4ccc9f3c5c66 100644
--- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
@@ -50,7 +50,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
   }
   if (value_grad) {
     value_grad->Resize(index.dims());
-    value_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(value_grad);
     if (index_type == DataType::INT32) {
       paddle::operators::gpu_gather_kernel<T, int32_t>(
           out_grad,
diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h
index 59c5988986360849c8158d704a9b74cb64e17952..3c82726662de70c3440bcab427ee42d722700058 100644
--- a/paddle/phi/kernels/gpu/rnn_functor.h
+++ b/paddle/phi/kernels/gpu/rnn_functor.h
@@ -58,7 +58,7 @@ class RNNDescriptors {
 
   template <typename T>
   void Create(const gpuDnnHandle_t &handle,
-              const Place &place,
+              const DeviceContext &dev_ctx,
               const std::vector<int> &sequence_length,
               size_t *workspace_size,
               size_t *reserve_size,
@@ -103,17 +103,15 @@ class RNNDescriptors {
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_GPU_SUCCESS(
           phi::dynload::miopenDropoutGetStatesSize(handle, &state_size));
-      dropout_state->mutable_data<uint8_t>({static_cast<int64_t>(state_size)},
-                                           place);
 #else
       PADDLE_ENFORCE_GPU_SUCCESS(
           phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
-      dropout_state->mutable_data<uint8_t>({static_cast<int64_t>(state_size)},
-                                           place);
 #endif
+      dropout_state->Resize({static_cast<int64_t>(state_size)});
+      dev_ctx.template Alloc<uint8_t>(dropout_state);
     }
     dropout_desc_.descriptor(handle,
-                             place,
+                             dev_ctx.GetPlace(),
                              is_initialized,
                              dropout_prob_,
                              is_test_ ? nullptr : dropout_state,
diff --git a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc
index fe0446323739f8bb3977806211eb5bd9195ea66e..ff1d295b11e6817a63cf5e0ca2967b768afcb04c 100644
--- a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc
@@ -248,7 +248,7 @@ void RnnGradKernel(const Context &dev_ctx,
                  is_test);
   rnn.Create<T>(handle,
-                dev_ctx.GetPlace(),
+                dev_ctx,
                 SequenceLength,
                 &workspace_size,
                 &reserve_size,
diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc
index 079a159ee81e9f163e43f7e8bf3a762c63c044ae..44fab87d910d76e8a468480d5725290f0be63a60 100644
--- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc
@@ -280,7 +280,7 @@ void RnnKernel(const Context &dev_ctx,
                  is_bidirec,
                  is_test);
   rnn.Create<T>(handle,
-                dev_ctx.GetPlace(),
+                dev_ctx,
                 SequenceLength,
                 &workspace_size,
                 &reserve_size,
diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu
index e3f0bf968c82cc754b31b07732f8e2314d6f2830..b7cf9e5badce0c86470e84570e5bd1f5bb34c439 100644
--- a/paddle/phi/kernels/gpu/sgd_kernel.cu
+++ b/paddle/phi/kernels/gpu/sgd_kernel.cu
@@ -82,9 +82,8 @@ void SGDDenseKernel(const Context& dev_ctx,
   const MPDType* master_in_data =
       multi_precision ? master_param->data<MPDType>() : nullptr;
   MPDType* master_out_data =
-      multi_precision
-          ? master_param_out->mutable_data<MPDType>(dev_ctx.GetPlace())
-          : nullptr;
+      multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_out)
+                      : nullptr;
 
   int block = 512;
   int grid = (param.numel() + block - 1) / block;
@@ -94,7 +93,7 @@ void SGDDenseKernel(const Context& dev_ctx,
       grad.data<T>(),
       learning_rate.data<T>(),
       param.numel(),
-      param_out->mutable_data<T>(dev_ctx.GetPlace()),
+      dev_ctx.template Alloc<T>(param_out),
       master_in_data,
       master_out_data);
 }
@@ -119,9 +118,8 @@ void SGDDenseParamSparseGradKernel(
   const MPDType* master_in_data =
       multi_precision ? master_param->data<MPDType>() : nullptr;
   MPDType* master_out_data =
-      multi_precision
-          ? master_param_out->mutable_data<MPDType>(dev_ctx.GetPlace())
-          : nullptr;
+      multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_out)
+                      : nullptr;
 
   PADDLE_ENFORCE_EQ(
       &param,
diff --git a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
index b4a1574ee84e849833cf33ab9879421d4fbd5220..cc5d95a12f7a3c7ec13824dd0d0dc3c6b9c621fc 100644
--- a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
@@ -120,8 +120,8 @@ void TemporalShiftGradKernel(const Context& dev_ctx,
       (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                         : phi::make_ddim({nt, h, w, c}));
   const T* output_grad_data = output_grad->data<T>();
-  T* input_grad_data =
-      input_grad->mutable_data<T>(in_grad_dims, dev_ctx.GetPlace());
+  input_grad->Resize(in_grad_dims);
+  T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
 
   int pixelNum = nt * chw;
   int threads = 1024;
diff --git a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
index c69a8aa2882169d0bae972e5c7589ddeebef46e1..b321fad07ac1fdb2b015edeeaee1475a75255888 100644
--- a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
+++ b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
@@ -120,7 +120,8 @@ void TemporalShiftKernel(const Context& dev_ctx,
       (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                         : phi::make_ddim({nt, h, w, c}));
   const T* input_data = input->data<T>();
-  T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace());
+  output->Resize(out_dims);
+  T* output_data = dev_ctx.template Alloc<T>(output);
 
   int pixelNum = nt * chw;
   int threads = 1024;
diff --git a/paddle/phi/kernels/gpu/yolo_box_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_kernel.cu
index 8baf339f0c6d72ce1223851b5d405ef49ada8aff..a55834c6ae7aa5e3809ecc1faabc4be98e08e732 100644
--- a/paddle/phi/kernels/gpu/yolo_box_kernel.cu
+++ b/paddle/phi/kernels/gpu/yolo_box_kernel.cu
@@ -139,9 +139,10 @@ void YoloBoxKernel(const Context& dev_ctx,
   const T* input_data = input->data<T>();
   const int* imgsize_data = img_size.data<int>();
-  T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, dev_ctx.GetPlace());
-  T* scores_data =
-      scores->mutable_data<T>({n, box_num, class_num}, dev_ctx.GetPlace());
+  boxes->Resize({n, box_num, 4});
+  T* boxes_data = dev_ctx.template Alloc<T>(boxes);
+  scores->Resize({n, box_num, class_num});
+  T* scores_data = dev_ctx.template Alloc<T>(scores);
   phi::funcs::SetConstant<Context, T> set_zero;
   set_zero(dev_ctx, boxes, static_cast<T>(0));
   set_zero(dev_ctx, scores, static_cast<T>(0));
diff --git a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
index 49046dfa4d20dd914ff6410f189139ab8576f30e..160e100f2b449d6f16771b35a55431bf32a52d2f 100644
--- a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
@@ -42,7 +42,7 @@ void DigammaGradKernel(const Context& ctx,
                        const DenseTensor& x,
                        const DenseTensor& out_grad,
                        DenseTensor* x_grad) {
-  x_grad->mutable_data<T>(ctx.GetPlace());
+  ctx.template Alloc<T>(x_grad);
 
   auto* dout_data = out_grad.data<T>();
   auto* x_data = x.data<T>();
diff --git a/paddle/phi/kernels/impl/digamma_kernel_impl.h b/paddle/phi/kernels/impl/digamma_kernel_impl.h
index 4547806a38ddbab1d7c6b40b0411a3fddfd4c779..ded77ca5a8f314d58d3c9cef110265e773831415 100644
--- a/paddle/phi/kernels/impl/digamma_kernel_impl.h
+++ b/paddle/phi/kernels/impl/digamma_kernel_impl.h
@@ -38,7 +38,7 @@ struct DigammaFunctor {
 
 template <typename T, typename Context>
 void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
-  out->mutable_data<T>(ctx.GetPlace());
+  ctx.template Alloc<T>(out);
   auto* x_data = x.data<T>();
   auto* out_data = out->data<T>();
   auto numel = x.numel();
diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
index 28387975e6e9982dcb3d025d8c3aff2601526fd0..396f1e9548648aba99b9c111a861bc884493c143 100644
--- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h
@@ -68,7 +68,7 @@ void AddDoubleGradImpl(const Context& dev_ctx,
   funcs::GetDoubleGradSafeTensor<T, Context>(
       dev_ctx, y, ddy.get_ptr(), &ddy_safe);
 
-  ddout->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(ddout);
   auto ddx_dims = ddx_safe.dims();
   auto ddy_dims = ddy_safe.dims();
   if (ddx_dims.size() >= ddy_dims.size()) {
@@ -102,7 +102,7 @@ void SubtractDoubleGradImpl(const Context& dev_ctx,
   funcs::GetDoubleGradSafeTensor<T, Context>(
       dev_ctx, y, ddy.get_ptr(), &ddy_safe);
 
-  ddout->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(ddout);
   funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T>(
       dev_ctx, ddx_safe, ddy_safe, axis, funcs::SubtractFunctor<T>(), ddout);
 }
diff --git a/paddle/phi/kernels/memcpy_kernel.cc b/paddle/phi/kernels/memcpy_kernel.cc
index acc87dc9960d1c78962b0704ba1f34be4e56f49c..521edc26af320c1d764e3ef1f4627bb309ae3452 100644
--- a/paddle/phi/kernels/memcpy_kernel.cc
+++ b/paddle/phi/kernels/memcpy_kernel.cc
@@ -25,32 +25,6 @@ namespace phi {
 
 static constexpr size_t WAIT_THRESHOLD = 64 * 1024;
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <>
-void MemcpyH2DKernel(const GPUContext& dev_ctx,
-                     const DenseTensor& x,
-                     int dst_place_type,
-                     DenseTensor* out) {
-  PADDLE_ENFORCE_GE(
-      dst_place_type,
-      0,
-      errors::OutOfRange("dst_place_type only support 0-3, but got: %d",
-                         dst_place_type));
-  PADDLE_ENFORCE_LE(
-      dst_place_type,
-      3,
-      errors::OutOfRange("dst_place_type only support 0-3, but got: %d",
-                         dst_place_type));
-
-  auto stream = dev_ctx.stream();
-  out->mutable_data(dev_ctx.GetPlace(),
-                    x.dtype(),
-                    phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
-
-  Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
-}
-#endif
-
 template <typename Context>
 void MemcpyH2DKernel(const Context& dev_ctx,
                      const DenseTensor& x,
@@ -77,10 +51,6 @@ void MemcpyD2HKernel(const Context& dev_ctx,
                      DenseTensor* out) {
   switch (dst_place_type) {
     case 0:
-      // NOTE(lvyongkang): phi::Copy will use DeviceContext.zero_allocator to
-      // alloc and assign DeviceContext.place to out, which causes place check
-      // fails. So we specify out's place here.
-      out->mutable_data(CPUPlace());
       Copy(dev_ctx, x, CPUPlace(), false, out);
       // NOTE(copy from Aurelius84): host <-> device memory copies of a memory
       // block of 64 KB or less are asynchronous. See
       // https://forums.developer.nvidia.com/t/host-device-memory-copies-up-to-64-kb-are-asynchronous/17907
       break;
 
     case 1:
-      // NOTE(lvyongkang): phi::Copy will use DeviceContext.zero_allocator to
-      // alloc and assign DeviceContext.place to out, which causes place check
-      // fails. So we specify out's place here.
-      out->mutable_data(GPUPinnedPlace());
       Copy(dev_ctx, x, GPUPinnedPlace(), false, out);
       // paddle::memory::Copy use async copy for GPUPinnedPlace
       dev_ctx.Wait();
diff --git a/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc b/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc
index a25cd0cd61303fe22ed99074b945c1c02a2deaaa..9dd8f7df08ccc356d8f3316d34702a27c32c9152 100644
--- a/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc
@@ -74,7 +74,7 @@ void AddGradKernel(const Context& dev_ctx,
   }
 
   if (dy != nullptr) {
-    T* dy_data = dy->mutable_data<T>(dev_ctx.GetPlace());
+    T* dy_data = dev_ctx.template Alloc<T>(dy);
     if (dy->dims() == dz_dims) {
       if (dy_data != dz_data) {
         int ret = xpu::copy(dev_ctx.x_context(),
diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc
index c5fca8881e221e37e04af4c3d116173c790b7e76..44c5842210b71b5f1705c9b16a11f8800170053d 100644
--- a/paddle/phi/kernels/xpu/full_kernel.cc
+++ b/paddle/phi/kernels/xpu/full_kernel.cc
@@ -45,7 +45,7 @@ void TensorSetConstantXPU(phi::DenseTensor* tensor,
 
 template <typename Context, typename VType>
 void FullValueXPU(const Context& dev_ctx, DenseTensor* tensor, VType val) {
-  tensor->mutable_data(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<VType>(tensor);
 
   PD_VISIT_ALL_TYPES(tensor->dtype(), "FullValueXPU", ([&] {
                        TensorSetConstantXPU(
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index f1aa530e870705bd76fcedd044c22bc9483504c1..63c204af017586d140a47226a73097b8783316b5 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -342,17 +342,17 @@ if [ "${PHI_INCLUDE_FLUID_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     check_approval 1 chenwhql YuanRisheng zyfncg
 fi
 
-HAS_MODIFIED_PHI_KERNEL_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/kernels" || true`
-PHI_USE_MUTABLE_DATA_FILES=""
-for CHANGE_FILE in ${HAS_MODIFIED_PHI_KERNEL_FILES}; do
-    PHI_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "mutable_data" || true`
-    if [ "${PHI_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-        PHI_USE_MUTABLE_DATA_FILES="${PHI_USE_MUTABLE_DATA_FILES} ${CHANGE_FILE}"
+HAS_MODIFIED_PHI_OR_FLUID_FILES=`git diff --name-only upstream/$BRANCH | grep -E "paddle/phi|paddle/fluid" || true`
+USE_MUTABLE_DATA_FILES=""
+for CHANGE_FILE in ${HAS_MODIFIED_PHI_OR_FLUID_FILES}; do
+    ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "mutable_data" || true`
+    if [ "${ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+        USE_MUTABLE_DATA_FILES="${USE_MUTABLE_DATA_FILES} ${CHANGE_FILE}"
     fi
 done
-if [ "${PHI_USE_MUTABLE_DATA_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You can not use the DenseTensor::mutable_data() method in paddle/phi/kernels files(${PHI_USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use phi::DeviceContext::Alloc() or phi::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, YuanRisheng or zyfncg) review and approve.\n"
-    check_approval 1 chenwhql Shixiaowei02 YuanRisheng zyfncg
+if [ "${USE_MUTABLE_DATA_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="You can not use the DenseTensor::mutable_data() method in files(${USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use phi::DeviceContext::Alloc() or phi::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, YuanRisheng, zyfncg or From00) review and approve.\n"
+    check_approval 1 chenwhql Shixiaowei02 YuanRisheng zyfncg From00
 fi
 
 ALL_CHANGE_FILES=`git diff --numstat upstream/$BRANCH | awk '{print $3}' | grep ".py"`
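
Migration note: the substitution this PR performs is uniform across all kernels above — set the tensor's shape first, then allocate through the device context. Below is a minimal before/after sketch of the idiom, assuming a typical phi kernel templated on T and Context; ExampleCopyKernel, its body, and its shapes are illustrative only and are not part of this PR:

    #include "paddle/phi/core/dense_tensor.h"

    namespace phi {

    template <typename T, typename Context>
    void ExampleCopyKernel(const Context& dev_ctx,
                           const DenseTensor& x,
                           DenseTensor* out) {
      // Old idiom (now rejected by tools/check_file_diff_approvals.sh):
      //   T* out_data = out->mutable_data<T>(x.dims(), dev_ctx.GetPlace());
      // mutable_data() resized and allocated in one call, choosing an
      // allocator from the Place alone.

      // New idiom: shape and allocation are separate steps, and allocation
      // goes through the DeviceContext's allocator (stream-aware on GPU,
      // which is what the CreateDeviceContext change above wires up):
      out->Resize(x.dims());
      T* out_data = dev_ctx.template Alloc<T>(out);

      // For buffers that must live on the host even inside a device kernel,
      // the counterpart is HostAlloc, as in the masked_select and fc_functor
      // changes:
      //   T* host_data = dev_ctx.template HostAlloc<T>(out);

      for (int64_t i = 0; i < x.numel(); ++i) {  // trivial body, illustration only
        out_data[i] = x.data<T>()[i];
      }
    }

    }  // namespace phi

This is also why the CPU unit tests gain an explicit ctx.SetAllocator(...) call: once kernels allocate through the context rather than the raw place, a bare phi::CPUContext has no allocator installed and the tests would fail on the first Alloc.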