Unverified commit 34a957e3, authored by Ruibiao Chen and committed by GitHub

Replace mutable_data with DeviceContext.Alloc in phi kernels (#48500)

* Replace mutable_data with DeviceContext.Alloc in phi kernels

* Fix CI errors

* Fix CI errors

* Fix CI errors, test=kunlun

* Fix CI errors, test=kunlun

* Handle rnn_functor

* Update approvals
Parent d1e2ba8a
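Every hunk below applies the same mechanical change: instead of allocating through DenseTensor::mutable_data<T>(place), a kernel now sets the tensor's shape with Resize() and then requests memory from the device context with Alloc<T>() (or HostAlloc<T>() for CPU-side buffers). A minimal before/after sketch, assuming the phi DenseTensor and DeviceContext APIs that appear in the hunks; the kernel name, header path, and shape are illustrative only:

#include "paddle/phi/core/dense_tensor.h"  // assumed header for phi::DenseTensor

template <typename T, typename Context>
void ExampleKernel(const Context& dev_ctx, phi::DenseTensor* out) {
  // Old style, now rejected by the CI check updated at the end of this diff:
  //   T* out_data = out->mutable_data<T>({4, 8}, dev_ctx.GetPlace());
  // New style: set the shape first, then allocate through the context.
  out->Resize({4, 8});
  T* out_data = dev_ctx.template Alloc<T>(out);
  // Host-side scratch tensors use HostAlloc instead:
  //   T* scratch_data = dev_ctx.template HostAlloc<T>(&scratch);
  for (int64_t i = 0; i < out->numel(); ++i) {
    out_data[i] = static_cast<T>(0);
  }
}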
@@ -50,7 +50,7 @@ math_library(tree2col DEPS math_function)
 cc_test(
   selected_rows_functor_test
   SRCS selected_rows_functor_test.cc
-  DEPS selected_rows_functor)
+  DEPS allocator selected_rows_functor)
 cc_test(
   im2col_test
   SRCS im2col_test.cc
...
@@ -15,11 +15,15 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/selected_rows_functor.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 TEST(selected_rows_functor, cpu_add) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -109,6 +113,9 @@ TEST(selected_rows_functor, cpu_add) {
 TEST(selected_rows_functor, cpu_add_to) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -198,6 +205,9 @@ TEST(selected_rows_functor, cpu_add_to) {
 TEST(selected_rows_functor, cpu_merge_average_float) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -233,6 +243,9 @@ TEST(selected_rows_functor, cpu_merge_average_float) {
 TEST(selected_rows_functor, cpu_merge_add_float) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -269,6 +282,9 @@ TEST(selected_rows_functor, cpu_merge_add_float) {
 TEST(selected_rows_functor, cpu_merge_add_int) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, int> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
@@ -305,6 +321,9 @@ TEST(selected_rows_functor, cpu_merge_add_int) {
 TEST(selected_rows_functor, cpu_merge_add_multi) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> set_const;
   int64_t height = 10;
@@ -354,6 +373,9 @@ TEST(selected_rows_functor, cpu_merge_add_multi) {
 TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> set_const;
   int64_t height = 10;
@@ -409,6 +431,9 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) {
 TEST(selected_rows_functor, cpu_sum_to) {
   paddle::platform::CPUPlace cpu_place;
   phi::CPUContext ctx(cpu_place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(cpu_place)
+                       .get());
   phi::funcs::SetConstant<phi::CPUContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
...
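For reference, the setup the tests above now share: the standalone phi::CPUContext built in each TEST needs an allocator attached before DeviceContext::Alloc can hand out memory, which is why every case gained a SetAllocator call and the cc_test target gained the allocator dependency. A minimal sketch of that setup, with includes abbreviated and the wrapper function name purely illustrative:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/backends/cpu/cpu_context.h"  // assumed header for phi::CPUContext

void SetUpCPUContextForTest() {
  paddle::platform::CPUPlace cpu_place;
  phi::CPUContext ctx(cpu_place);
  // Wire the facade-provided CPU allocator into the context so that
  // ctx.Alloc<T>(...) inside the functors under test can obtain memory.
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(cpu_place)
                       .get());
}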
@@ -163,7 +163,7 @@ std::unique_ptr<DeviceContext> CreateDeviceContext(
   if (!disable_setting_default_stream_for_allocator) {
     instance.SetDefaultStream(CUDAPlace(p.GetDeviceId()), cuda_ctx->stream());
   }
-  dev_ctx->SetAllocator(instance.GetAllocator(p).get());
+  dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
   dev_ctx->SetPinnedAllocator(
       instance.GetAllocator(paddle::platform::CUDAPinnedPlace()).get());
...
@@ -148,7 +148,7 @@ struct DeviceContext::Impl {
     if (tensor->initialized() && tensor->place() != place) {
       ClearHolder(tensor);
     }
-    auto* allocator = tensor->numel() == 0
+    auto* allocator = tensor->numel() == 0 && requested_size == 0
                           ? zero_allocator_
                           : (pinned ? pinned_allocator_ : device_allocator_);
 #ifdef PADDLE_WITH_CUDA
...
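The condition change above means the zero-size allocator is used only when both the tensor's numel() and the caller's requested_size are zero, so a kernel that asks for an explicit byte count on an as-yet-unshaped tensor still gets real device (or pinned) memory. A hypothetical fragment illustrating the case this enables, matching the SortTopk hunk further down where a workspace is allocated by byte count alone (ctx and temp_storage_bytes are assumed to exist in the surrounding kernel):

// temp_storage has never been Resize()d, so numel() == 0, but a non-zero
// byte count is requested; with the new condition this call is served by
// the device allocator rather than zero_allocator_.
phi::DenseTensor temp_storage;
ctx.template Alloc<uint8_t>(&temp_storage, temp_storage_bytes);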
@@ -44,7 +44,7 @@ void ConcatKernel(const Context& dev_ctx,
   phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis);
   out->Resize(out_dims);
-  out->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(out);
   // If axis is 0, the lod of the output is not the same as inputs.
   if (axis == 0 && x[0]->lod().size() > 0) {
...
@@ -90,13 +90,11 @@ ElementwiseAddGrad(const CPUContext& ctx,
                    int axis = -1) {
   auto blas = phi::funcs::GetBlas<CPUContext, T>(ctx);
   if (dx) {
-    blas.VCOPY(
-        dout.numel(), dout.data<T>(), dx->mutable_data<T>(ctx.GetPlace()));
+    blas.VCOPY(dout.numel(), dout.data<T>(), ctx.template Alloc<T>(dx));
   }
   if (dy) {
-    blas.VCOPY(
-        dout.numel(), dout.data<T>(), dy->mutable_data<T>(ctx.GetPlace()));
+    blas.VCOPY(dout.numel(), dout.data<T>(), ctx.template Alloc<T>(dy));
   }
 }
...
@@ -34,7 +34,7 @@ void HistogramKernel(const Context& dev_ctx,
   const T* input_data = input.data<T>();
   auto input_numel = input.numel();
-  int64_t* out_data = output->mutable_data<int64_t>(dev_ctx.GetPlace());
+  int64_t* out_data = dev_ctx.template Alloc<int64_t>(output);
   phi::funcs::SetConstant<Context, int64_t>()(
       dev_ctx, output, static_cast<int64_t>(0));
...
@@ -27,7 +27,8 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
                             DenseTensor* x_grad) {
   auto* mask_data = mask.data<bool>();
   auto* input_data = out_grad.data<T>();
-  auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
+  auto* out_data = dev_ctx.template Alloc<T>(x_grad);
   int mask_size = mask.numel();
   int index = 0;
...
@@ -48,7 +48,8 @@ void MaskedSelectKernel(const Context& dev_ctx,
   DDim out_dim{out_size};
   out->Resize(out_dim);
-  auto out_data = out->mutable_data<T>(phi::CPUPlace());
+  auto out_data = dev_ctx.template HostAlloc<T>(out);
   int index = 0;
   for (int i = 0; i < mask_size; i++) {
...
@@ -58,7 +58,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
   if (value_grad) {
     value_grad->Resize(index.dims());
-    value_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(value_grad);
     if (index_type == DataType::INT32) {
       paddle::operators::cpu_gather_kernel<T, int32_t>(
           out_grad, axis, index, *value_grad, dev_ctx);
...
@@ -114,8 +114,9 @@ void TemporalShiftGradKernel(const Context& dev_ctx,
       (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                         : phi::make_ddim({nt, h, w, c}));
   const T* output_grad_data = output_grad->data<T>();
-  T* input_grad_data =
-      input_grad->mutable_data<T>(in_grad_dims, dev_ctx.GetPlace());
+  input_grad->Resize(in_grad_dims);
+  T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
   if (data_layout == DataLayout::kNCHW) {
     TemporalShiftBwNCHW<T>(
...
@@ -114,7 +114,8 @@ void TemporalShiftKernel(const Context& dev_ctx,
       (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                         : phi::make_ddim({nt, h, w, c}));
   const T* input_data = input->data<T>();
-  T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace());
+  output->Resize(out_dims);
+  T* output_data = dev_ctx.template Alloc<T>(output);
   if (data_layout == DataLayout::kNCHW) {
     TemporalShiftFwNCHW<T>(
...
@@ -51,16 +51,19 @@ void YoloBoxKernel(const Context& dev_ctx,
   const int an_stride = (class_num + 5) * stride;
   DenseTensor anchors_;
-  auto anchors_data =
-      anchors_.mutable_data<int>({an_num * 2}, dev_ctx.GetPlace());
+  anchors_.Resize({an_num * 2});
+  auto anchors_data = dev_ctx.template Alloc<int>(&anchors_);
   std::copy(anchors.begin(), anchors.end(), anchors_data);
   const T* input_data = input->data<T>();
   const int* imgsize_data = imgsize->data<int>();
-  T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, dev_ctx.GetPlace());
+  boxes->Resize({n, box_num, 4});
+  T* boxes_data = dev_ctx.template Alloc<T>(boxes);
   memset(boxes_data, 0, boxes->numel() * sizeof(T));
-  T* scores_data =
-      scores->mutable_data<T>({n, box_num, class_num}, dev_ctx.GetPlace());
+  scores->Resize({n, box_num, class_num});
+  T* scores_data = dev_ctx.template Alloc<T>(scores);
   memset(scores_data, 0, scores->numel() * sizeof(T));
   T box[4];
...
@@ -996,7 +996,7 @@ void ElementwiseCompute(const GPUContext &dev_ctx,
                         DenseTensor *z) {
   std::vector<const DenseTensor *> ins = {&x, &y};
   std::vector<DenseTensor *> outs = {z};
-  z->mutable_data<OutType>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<OutType>(z);
   BroadcastKernel<ElementwiseType::kBinary, T, OutType, Functor, 1>(
       dev_ctx, ins, &outs, axis, func);
 }
...
@@ -104,7 +104,8 @@ DenseTensor Diagonal(const DeviceContext& context,
   DenseTensor diag;
   DDim diag_dims = phi::make_ddim(ret_dims);
   auto dig_stride = phi::stride(diag_dims);
-  auto diag_data = diag.mutable_data<T>(diag_dims, context.GetPlace());
+  diag.Resize(diag_dims);
+  auto diag_data = context.template Alloc<T>(&diag);
   int64_t pos = std::abs(offset) * offset_stride;
   int64_t dim_size = ret_strides.size();
...
@@ -474,7 +474,7 @@ static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx,
   } else {
     auto meta = phi::DenseTensorMeta(x.dtype(), x.dims(), x.layout());
     *ddx_safe = phi::Empty(dev_ctx, std::move(meta));
-    ddx_safe->mutable_data(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(ddx_safe);
     SetConstant<DeviceContext, T> set_zero;
     set_zero(dev_ctx, ddx_safe, static_cast<T>(0));
   }
...
@@ -237,7 +237,8 @@ void CommonElementwiseBroadcastBackward(const CPUContext &ctx,
   // result.
   if (dx && dx->IsSharedBufferWith(dout)) {
     dx->clear();
-    dx->mutable_data<T>(x_dims, ctx.GetPlace());
+    dx->Resize(x_dims);
+    ctx.template Alloc<T>(dx);
   }
   VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
@@ -1680,7 +1681,8 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx,
   // result.
   if (dx && dx->IsSharedBufferWith(dout)) {
     dx->clear();
-    dx->mutable_data<T>(x_dims, ctx.GetPlace());
+    dx->Resize(x_dims);
+    ctx.template Alloc<T>(dx);
   }
   VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
...
@@ -39,8 +39,11 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
   const int NN = N + 4;
   const int KK = K + 4;
   phi::DenseTensor X1;
-  T* X1_data = X1.mutable_data<T>({M * KK}, paddle::platform::CPUPlace());
-  Y1_data = Y1.mutable_data<T>({M * (N + 4)}, paddle::platform::CPUPlace());
+  X1.Resize({M * KK});
+  T* X1_data = context.template HostAlloc<T>(&X1);
+  Y1.Resize({M * (N + 4)});
+  Y1_data = context.template HostAlloc<T>(&Y1);
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
...
@@ -319,7 +319,9 @@ void ColwiseSum<phi::GPUContext, double>::operator()(
                         size,
                         vector->numel()));
   phi::DenseTensor one;
-  one.mutable_data<double>({in_dims[0]}, context.GetPlace());
+  one.Resize({in_dims[0]});
+  context.template Alloc<double>(&one);
   SetConstant<phi::GPUContext, double> set;
   set(context, &one, static_cast<double>(1.0));
   phi::funcs::GetBlas<phi::GPUContext, double>(context).GEMV(
@@ -355,7 +357,9 @@ void RowwiseSum<phi::GPUContext, double>::operator()(
                         in_dims[0],
                         vector->numel()));
   phi::DenseTensor one;
-  one.mutable_data<double>({size}, context.GetPlace());
+  one.Resize({size});
+  context.template Alloc<double>(&one);
   SetConstant<phi::GPUContext, double> set;
   set(context, &one, static_cast<double>(1.0));
   phi::funcs::GetBlas<phi::GPUContext, double>(context).GEMV(
...
@@ -117,7 +117,7 @@ class ColwiseSum<phi::CPUContext, T> {
                       size,
                       out->numel()));
-    T* out_buf = out->mutable_data<T>(out->place());
+    T* out_buf = context.template Alloc<T>(out);
     const T* in_buf = input.data<T>();
     for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
@@ -185,7 +185,7 @@ class RowwiseMean<phi::CPUContext, T> {
                       height,
                       out->numel()));
     auto inv_size = 1.0 / size;
-    T* out_buf = out->mutable_data<T>(out->place());
+    T* out_buf = context.template Alloc<T>(out);
     const T* in_buf = input.data<T>();
     for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
@@ -251,7 +251,7 @@ class RowwiseSum<phi::CPUContext, T> {
                       height,
                      out->numel()));
-    T* out_buf = out->mutable_data<T>(out->place());
+    T* out_buf = context.template Alloc<T>(out);
     const T* in_buf = input.data<T>();
     for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
...
@@ -451,7 +451,7 @@ void SelectKernel(const KPDevice &dev_ctx,
     out_dim.push_back(static_cast<int64_t>(rank));
     out->Resize(phi::make_ddim(out_dim));
   }
-  auto out_data = out->mutable_data<OutT>(cuda_place);
+  auto out_data = dev_ctx.template Alloc<OutT>(out);
   // 3.2 get true data's index according to cond_data and cumsum_data
   if (total_true_num <= 0) return;
   SelectKernel<MT, InT, CT, OutT, Functor, kVecSize, SelectData>
...
@@ -542,11 +542,10 @@ struct MergeAddImpl {
     }
     out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim(
-            {static_cast<int64_t>(merged_row_set.size()), input_width}),
-        context.GetPlace());
-    auto* out_data = out.mutable_value()->data<T>();
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(phi::make_ddim(
+        {static_cast<int64_t>(merged_row_set.size()), input_width}));
+    auto* out_data = context.template Alloc<T>(out_tensor);
     if (merged_row_set.size() == row_num && !sorted_result) {
       // no duplicated ids, just concat the result together
@@ -659,9 +658,10 @@ struct MergeAdd<phi::XPUContext, T> {
     out.set_rows(merge_rows);
     out.set_height(input.height());
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(
+        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}));
+    context.template Alloc<T>(out_tensor);
     std::unordered_map<int64_t, size_t> rows_to_id;
     for (size_t i = 0; i < merge_rows.size(); ++i) {
@@ -748,12 +748,13 @@ struct MergeAdd<phi::XPUContext, T> {
     out.set_rows(merge_rows);
     out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim(
-            {static_cast<int64_t>(merged_row_set.size()), input_width}),
-        context.GetPlace());
-    float* y_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(phi::make_ddim(
+        {static_cast<int64_t>(merged_row_set.size()), input_width}));
+    context.template Alloc<T>(out_tensor);
+    float* y_data = reinterpret_cast<float*>(out_tensor->data<T>());
     std::unordered_map<int64_t, size_t> rows_to_id;
     for (size_t i = 0; i < merge_rows.size(); ++i) {
@@ -856,11 +857,11 @@ struct MergeAverage<phi::CPUContext, T> {
     }
     out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim(
-            {static_cast<int64_t>(merged_row_set.size()), input_width}),
-        context.GetPlace());
-    auto* out_data = out.mutable_value()->data<T>();
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(phi::make_ddim(
+        {static_cast<int64_t>(merged_row_set.size()), input_width}));
+    auto* out_data = context.template Alloc<T>(out_tensor);
     std::vector<int64_t> merge_rows(merged_row_set.begin(),
                                     merged_row_set.end());
...
@@ -392,9 +392,10 @@ struct MergeAddImpl {
     out.set_rows(merge_rows);
     out.set_height(input.height());
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(
+        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}));
+    context.template Alloc<T>(out_tensor);
     phi::funcs::SetConstant<DeviceContext, T> constant_functor;
     constant_functor(context, out.mutable_value(), static_cast<T>(0));
@@ -462,9 +463,11 @@ struct MergeAddImpl {
     out.set_rows(merge_rows);
     out.set_height(input_height);
-    out.mutable_value()->mutable_data<T>(
-        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
+    DenseTensor* out_tensor = out.mutable_value();
+    out_tensor->Resize(
+        phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}));
+    context.template Alloc<T>(out_tensor);
     phi::funcs::SetConstant<DeviceContext, T> constant_functor;
     constant_functor(context, out.mutable_value(), static_cast<T>(0));
...
@@ -941,8 +941,7 @@ bool SortTopk(const phi::GPUContext& ctx,
   const std::vector<int64_t> dims = {num_rows, num_cols};
   auto dim = phi::make_ddim(dims);
   input_indices.Resize(dim);
-  // input_indices.Resize(num_rows*num_cols);
-  input_indices.mutable_data<int64_t>(ctx.GetPlace());
+  ctx.template Alloc<int64_t>(&input_indices);
   size_t temp_storage_bytes = -1;
   auto ComputeBlockSize = [](int col) {
@@ -984,7 +983,7 @@ bool SortTopk(const phi::GPUContext& ctx,
   const T* input = input_tensor->data<T>();
   T* values = out_tensor->data<T>();
-  int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
+  int64_t* indices = ctx.template Alloc<int64_t>(indices_tensor);
   if (k == num_cols) {
     // Doing a full sort.
@@ -993,8 +992,8 @@ bool SortTopk(const phi::GPUContext& ctx,
   } else {
     temp_values.Resize(dim);
     temp_indices.Resize(dim);
-    sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
-    sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
+    sorted_values_ptr = ctx.template Alloc<T>(&temp_values);
+    sorted_indices_ptr = ctx.template Alloc<int64_t>(&temp_indices);
   }
   // Get temp storage buffer size, maybe can allocate a fixed buffer to save
@@ -1067,7 +1066,7 @@ bool SortTopk(const phi::GPUContext& ctx,
 #endif
   }
   Tensor temp_storage;
-  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
+  ctx.template Alloc<uint8_t>(&temp_storage, temp_storage_bytes);
   if (largest) {
     auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
...
@@ -43,7 +43,7 @@ void ConcatKernel(const Context& dev_ctx,
   phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis);
   out->Resize(out_dims);
-  out->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(out);
   // If axis is 0, the lod of the output is not the same as inputs.
   if (axis == 0 && x[0]->lod().size() > 0) {
...
@@ -1231,7 +1231,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
     const T* input_data = input.data<T>();
     const T* filter_data = filter.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* output_data = context.template Alloc<T>(output);
     phi::DenseTensor filter_hwc;
     if (data_layout == DataLayout::kNHWC) {
@@ -1240,7 +1240,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
                                         filter.dims()[0],
                                         filter.dims()[1]});
       filter_hwc.Resize(filter_hwc_dims);
-      filter_hwc.mutable_data<T>(context.GetPlace());
+      context.template Alloc<T>(&filter_hwc);
       std::vector<int> perm_axis({2, 3, 0, 1});
       phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
       trans(context, filter, &filter_hwc, perm_axis);
@@ -1409,7 +1409,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
     const T* input_data = input.data<T>();
     const T* filter_data = filter.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    T* input_grad_data = context.template Alloc<T>(input_grad);
     phi::DenseTensor filter_hwc;
     if (data_layout == DataLayout::kNHWC) {
@@ -1418,7 +1418,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
                                         filter.dims()[0],
                                         filter.dims()[1]});
       filter_hwc.Resize(filter_hwc_dims);
-      filter_hwc.mutable_data<T>(context.GetPlace());
+      context.template Alloc<T>(&filter_hwc);
       std::vector<int> perm_axis({2, 3, 0, 1});
       phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
       trans(context, filter, &filter_hwc, perm_axis);
@@ -1584,7 +1584,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
     const T* input_data = input.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
+    T* filter_grad_data = context.template Alloc<T>(filter_grad);
     int block_size = 512;
     int blocks;
@@ -1654,7 +1654,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
                          filter_grad->dims()[0],            \
                          filter_grad->dims()[1]});          \
     filter_grad_hwc.Resize(filter_grad_hwc_dims);           \
-    filter_grad_hwc.mutable_data<T>(context.GetPlace());    \
+    context.template Alloc<T>(&filter_grad_hwc);            \
     phi::funcs::SetConstant<phi::GPUContext, T> set_zero;   \
     set_zero(context, &filter_grad_hwc, static_cast<T>(0)); \
     filter_grad_data = filter_grad_hwc.data<T>();           \
...
@@ -75,7 +75,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
   phi::funcs::SetConstant<Context, T> set_zero;
   if (input_grad) {
-    input_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(input_grad);
     set_zero(dev_ctx, input_grad, static_cast<T>(0));
     if (fuse_relu) {
@@ -106,7 +106,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
   }
   if (filter_grad) {
-    filter_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(filter_grad);
     set_zero(dev_ctx, filter_grad, static_cast<T>(0));
     if (fuse_relu) {
       paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T, true>
...
@@ -32,7 +32,7 @@ void DepthwiseConvKernel(const Context& dev_ctx,
                          const std::string& data_format,
                          DenseTensor* out) {
   DenseTensor* output = out;
-  output->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(output);
   const std::vector<int> strides = strides_t;
   std::vector<int> dilations = dilations_t;
...
@@ -29,7 +29,7 @@ void DropoutGradRawKernel(const Context& dev_ctx,
                           const std::string& mode,
                           DenseTensor* x_grad) {
   bool upscale_in_train = (mode == "upscale_in_train");
-  x_grad->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(x_grad);
   paddle::operators::DropoutGradGPUKernelDriver<T>(dev_ctx,
                                                    is_test,
                                                    p.to<float>(),
...
@@ -153,7 +153,7 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx,
   // dx
   if (dx != nullptr) {
-    auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    auto *dx_data = ctx.template Alloc<T>(dx);
     if (dx->dims() == dout.dims()) {
       if (dx_data != dout_data) {
         phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
@@ -163,7 +163,8 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx,
       // the result of dy wrong.
       if (dx->IsSharedBufferWith(dout)) {
         dx->clear();
-        dx->mutable_data<T>(x.dims(), ctx.GetPlace());
+        dx->Resize(x.dims());
+        ctx.template Alloc<T>(dx);
       }
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(x.dims(), out.dims(), axis);
@@ -173,7 +174,7 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx,
   }
   // dy
   if (dy != nullptr) {
-    auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
+    auto *dy_data = ctx.template Alloc<T>(dy);
     if (dy->dims() == dout.dims()) {
       if (dy_data != dout_data) {
         phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
@@ -217,12 +218,11 @@ void ElementwiseAddGrad(const GPUContext &ctx,
                         PREDEFINED_BLOCK_SIZE,
                     1);
     SimpleElemwiseAddGradCUDAKernel<T>
-        <<<grid_size, block_size, 0, ctx.stream()>>>(
-            dout.data<T>(),
-            size,
-            vec_size,
-            dx->mutable_data<T>(ctx.GetPlace()),
-            dy->mutable_data<T>(ctx.GetPlace()));
+        <<<grid_size, block_size, 0, ctx.stream()>>>(dout.data<T>(),
+                                                     size,
+                                                     vec_size,
+                                                     ctx.template Alloc<T>(dx),
+                                                     ctx.template Alloc<T>(dy));
   } else {
     VLOG(4) << "Special case when dy_data is the same as dout_data, "
                "and dx_data is the same as dout_data, do not need "
@@ -264,7 +264,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
   auto *dout_data = dout.data<T>();
   // dx
   if (dx != nullptr) {
-    auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    auto *dx_data = ctx.template Alloc<T>(dx);
     if (dx->dims() == dout.dims()) {
       if (dx_data != dout_data) {
         phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
@@ -274,7 +274,8 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
       // the result of dy wrong.
       if (dx->IsSharedBufferWith(dout)) {
         dx->clear();
-        dx->mutable_data<T>(x.dims(), ctx.GetPlace());
+        dx->Resize(x.dims());
+        ctx.template Alloc<T>(dx);
       }
       std::vector<int> reduce_dims =
          funcs::GetReduceDim(x.dims(), out.dims(), axis);
@@ -284,7 +285,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
   }
   // dy
   if (dy != nullptr) {
-    auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
+    auto *dy_data = ctx.template Alloc<T>(dy);
     if (dy->dims() == dout.dims()) {
       if (dy_data != dout_data) {
         dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
@@ -293,10 +294,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
             dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
         SimpleElemwiseSubGradCUDAKernel<T>
             <<<grid_size, block_size, 0, ctx.stream()>>>(
-                dout.data<T>(),
-                size,
-                nullptr,
-                dy->mutable_data<T>(ctx.GetPlace()));
+                dout.data<T>(), size, nullptr, ctx.template Alloc<T>(dy));
       }
     } else {
       std::vector<int> reduce_dims =
@@ -320,11 +318,10 @@ void elementwise_sub_grad(const GPUContext &ctx,
   dim3 grid_size =
       dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
   SimpleElemwiseSubGradCUDAKernel<T>
-      <<<grid_size, block_size, 0, ctx.stream()>>>(
-          dout.data<T>(),
-          size,
-          dx->mutable_data<T>(ctx.GetPlace()),
-          dy->mutable_data<T>(ctx.GetPlace()));
+      <<<grid_size, block_size, 0, ctx.stream()>>>(dout.data<T>(),
+                                                   size,
+                                                   ctx.template Alloc<T>(dx),
+                                                   ctx.template Alloc<T>(dy));
 }
 /*
 ******************************
...
@@ -314,9 +314,9 @@ void GraphReindexKernel(const Context& dev_ctx,
     const auto* ph_index = hashtable_index.get_ptr();
     hashtable_index_out.ShareDataWith(*ph_index);
     int* hashtable_value_data =
-        hashtable_value_out.mutable_data<int>(dev_ctx.GetPlace());
+        dev_ctx.template Alloc<int>(&hashtable_value_out);
     int* hashtable_index_data =
-        hashtable_index_out.mutable_data<int>(dev_ctx.GetPlace());
+        dev_ctx.template Alloc<int>(&hashtable_index_out);
     BufferReindex<T, Context>(dev_ctx,
                               x_data,
                               src_outputs,
...
@@ -85,7 +85,7 @@ void HistogramKernel(const Context& dev_ctx,
   const T* input_data = input.data<T>();
   const int input_numel = input.numel();
-  int64_t* out_data = output->mutable_data<int64_t>(dev_ctx.GetPlace());
+  int64_t* out_data = dev_ctx.template Alloc<int64_t>(output);
   phi::funcs::SetConstant<Context, int64_t>()(
       dev_ctx, output, static_cast<int64_t>(0));
@@ -98,8 +98,10 @@ void HistogramKernel(const Context& dev_ctx,
   auto input_x = phi::EigenVector<T>::Flatten(input);
   DenseTensor input_min_t, input_max_t;
-  auto* input_min_data = input_min_t.mutable_data<T>({1}, dev_ctx.GetPlace());
-  auto* input_max_data = input_max_t.mutable_data<T>({1}, dev_ctx.GetPlace());
+  input_min_t.Resize({1});
+  input_max_t.Resize({1});
+  auto* input_min_data = dev_ctx.template Alloc<T>(&input_min_t);
+  auto* input_max_data = dev_ctx.template Alloc<T>(&input_max_t);
   auto input_min_scala = phi::EigenScalar<T>::From(input_min_t);
   auto input_max_scala = phi::EigenScalar<T>::From(input_max_t);
...
@@ -67,7 +67,7 @@ bool SortKthvalue(const phi::GPUContext& dev_ctx,
   DenseTensor temp_values, temp_indices;
   const T* input = input_tensor->data<T>();
   T* values = out_tensor->data<T>();
-  int64_t* indices = indices_tensor->mutable_data<int64_t>(dev_ctx.GetPlace());
+  int64_t* indices = dev_ctx.template Alloc<int64_t>(indices_tensor);
   temp_values.Resize(dim);
   temp_indices.Resize(dim);
   sorted_values_ptr = dev_ctx.template Alloc<T>(&temp_values);
@@ -208,13 +208,16 @@ void KthvalueKernel(const Context& dev_ctx,
     }
     trans_out_dims[in_dims.size() - 1] = 1;
     DenseTensor trans_input;
-    trans_input.mutable_data<T>(trans_dims, dev_ctx.GetPlace());
+    trans_input.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&trans_input);
     int ndims = trans.size();
     funcs::TransCompute<phi::GPUContext, T>(
         ndims, dev_ctx, x, &trans_input, trans);
     DenseTensor trans_ind, trans_out;
-    trans_ind.mutable_data<int64_t>(trans_out_dims, dev_ctx.GetPlace());
-    trans_out.mutable_data<T>(trans_out_dims, dev_ctx.GetPlace());
+    trans_ind.Resize(trans_out_dims);
+    trans_out.Resize(trans_out_dims);
+    dev_ctx.template Alloc<int64_t>(&trans_ind);
+    dev_ctx.template Alloc<T>(&trans_out);
     const int64_t input_height =
         phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
     const int64_t input_width = trans_dims[trans_dims.size() - 1];
...
@@ -50,7 +50,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
   }
   if (value_grad) {
     value_grad->Resize(index.dims());
-    value_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(value_grad);
     if (index_type == DataType::INT32) {
       paddle::operators::gpu_gather_kernel<T, int32_t>(
           out_grad,
...
@@ -58,7 +58,7 @@ class RNNDescriptors {
   template <typename T>
   void Create(const gpuDnnHandle_t &handle,
-              const Place &place,
+              const DeviceContext &dev_ctx,
               const std::vector<int> &sequence_length,
               size_t *workspace_size,
               size_t *reserve_size,
@@ -103,17 +103,15 @@ class RNNDescriptors {
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::miopenDropoutGetStatesSize(handle, &state_size));
-      dropout_state->mutable_data<uint8_t>({static_cast<int64_t>(state_size)},
-                                           place);
 #else
       PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
-      dropout_state->mutable_data<uint8_t>({static_cast<int64_t>(state_size)},
-                                           place);
 #endif
+      dropout_state->Resize({static_cast<int64_t>(state_size)});
+      dev_ctx.template Alloc<uint8_t>(dropout_state);
     }
     dropout_desc_.descriptor(handle,
-                             place,
+                             dev_ctx.GetPlace(),
                              is_initialized,
                              dropout_prob_,
                              is_test_ ? nullptr : dropout_state,
...
@@ -248,7 +248,7 @@ void RnnGradKernel(const Context &dev_ctx,
                 is_test);
   rnn.Create<T>(handle,
-                dev_ctx.GetPlace(),
+                dev_ctx,
                 SequenceLength,
                 &workspace_size,
                 &reserve_size,
...
@@ -280,7 +280,7 @@ void RnnKernel(const Context &dev_ctx,
                 is_bidirec,
                 is_test);
   rnn.Create<T>(handle,
-                dev_ctx.GetPlace(),
+                dev_ctx,
                 SequenceLength,
                 &workspace_size,
                 &reserve_size,
...
@@ -82,9 +82,8 @@ void SGDDenseKernel(const Context& dev_ctx,
   const MPDType* master_in_data =
       multi_precision ? master_param->data<MPDType>() : nullptr;
   MPDType* master_out_data =
-      multi_precision
-          ? master_param_out->mutable_data<MPDType>(dev_ctx.GetPlace())
-          : nullptr;
+      multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_out)
+                      : nullptr;
   int block = 512;
   int grid = (param.numel() + block - 1) / block;
@@ -94,7 +93,7 @@ void SGDDenseKernel(const Context& dev_ctx,
       grad.data<T>(),
       learning_rate.data<T>(),
       param.numel(),
-      param_out->mutable_data<T>(dev_ctx.GetPlace()),
+      dev_ctx.template Alloc<T>(param_out),
      master_in_data,
      master_out_data);
 }
@@ -119,9 +118,8 @@ void SGDDenseParamSparseGradKernel(
   const MPDType* master_in_data =
       multi_precision ? master_param->data<MPDType>() : nullptr;
   MPDType* master_out_data =
-      multi_precision
-          ? master_param_out->mutable_data<MPDType>(dev_ctx.GetPlace())
-          : nullptr;
+      multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_out)
+                      : nullptr;
   PADDLE_ENFORCE_EQ(
       &param,
...
@@ -120,8 +120,8 @@ void TemporalShiftGradKernel(const Context& dev_ctx,
       (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                         : phi::make_ddim({nt, h, w, c}));
   const T* output_grad_data = output_grad->data<T>();
-  T* input_grad_data =
-      input_grad->mutable_data<T>(in_grad_dims, dev_ctx.GetPlace());
+  input_grad->Resize(in_grad_dims);
+  T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
   int pixelNum = nt * chw;
   int threads = 1024;
...
@@ -120,7 +120,8 @@ void TemporalShiftKernel(const Context& dev_ctx,
       (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                         : phi::make_ddim({nt, h, w, c}));
   const T* input_data = input->data<T>();
-  T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace());
+  output->Resize(out_dims);
+  T* output_data = dev_ctx.template Alloc<T>(output);
   int pixelNum = nt * chw;
   int threads = 1024;
...
@@ -139,9 +139,10 @@ void YoloBoxKernel(const Context& dev_ctx,
   const T* input_data = input->data<T>();
   const int* imgsize_data = img_size.data<int>();
-  T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, dev_ctx.GetPlace());
-  T* scores_data =
-      scores->mutable_data<T>({n, box_num, class_num}, dev_ctx.GetPlace());
+  boxes->Resize({n, box_num, 4});
+  T* boxes_data = dev_ctx.template Alloc<T>(boxes);
+  scores->Resize({n, box_num, class_num});
+  T* scores_data = dev_ctx.template Alloc<T>(scores);
   phi::funcs::SetConstant<phi::GPUContext, T> set_zero;
   set_zero(dev_ctx, boxes, static_cast<T>(0));
   set_zero(dev_ctx, scores, static_cast<T>(0));
...
@@ -42,7 +42,7 @@ void DigammaGradKernel(const Context& ctx,
                        const DenseTensor& x,
                        const DenseTensor& out_grad,
                        DenseTensor* x_grad) {
-  x_grad->mutable_data<T>(ctx.GetPlace());
+  ctx.template Alloc<T>(x_grad);
   auto* dout_data = out_grad.data<T>();
   auto* x_data = x.data<T>();
...
@@ -38,7 +38,7 @@ struct DigammaFunctor {
 template <typename T, typename Context>
 void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
-  out->mutable_data<T>(ctx.GetPlace());
+  ctx.template Alloc<T>(out);
   auto* x_data = x.data<T>();
   auto* out_data = out->data<T>();
   auto numel = x.numel();
...
@@ -68,7 +68,7 @@ void AddDoubleGradImpl(const Context& dev_ctx,
     funcs::GetDoubleGradSafeTensor<Context, T>(
         dev_ctx, y, ddy.get_ptr(), &ddy_safe);
-    ddout->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(ddout);
     auto ddx_dims = ddx_safe.dims();
     auto ddy_dims = ddy_safe.dims();
     if (ddx_dims.size() >= ddy_dims.size()) {
@@ -102,7 +102,7 @@ void SubtractDoubleGradImpl(const Context& dev_ctx,
     funcs::GetDoubleGradSafeTensor<Context, T>(
         dev_ctx, y, ddy.get_ptr(), &ddy_safe);
-    ddout->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(ddout);
     funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T>(
         dev_ctx, ddx_safe, ddy_safe, axis, funcs::SubtractFunctor<T>(), ddout);
   }
...
@@ -25,32 +25,6 @@ namespace phi {
 static constexpr size_t WAIT_THRESHOLD = 64 * 1024;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <>
-void MemcpyH2DKernel(const GPUContext& dev_ctx,
-                     const DenseTensor& x,
-                     int dst_place_type,
-                     DenseTensor* out) {
-  PADDLE_ENFORCE_GE(
-      dst_place_type,
-      0,
-      errors::OutOfRange("dst_place_type only support 0-3, but got: %d",
-                         dst_place_type));
-  PADDLE_ENFORCE_LE(
-      dst_place_type,
-      3,
-      errors::OutOfRange("dst_place_type only support 0-3, but got: %d",
-                         dst_place_type));
-  auto stream = dev_ctx.stream();
-  out->mutable_data(dev_ctx.GetPlace(),
-                    x.dtype(),
-                    phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
-  Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
-}
-#endif
 template <typename Context>
 void MemcpyH2DKernel(const Context& dev_ctx,
                      const DenseTensor& x,
@@ -77,10 +51,6 @@ void MemcpyD2HKernel(const Context& dev_ctx,
                      DenseTensor* out) {
   switch (dst_place_type) {
     case 0:
-      // NOTE(lvyongkang): phi::Copy will use DeviceContext.zero_allocator to
-      // alloc and assign DeviceContext.place to out, which causes place check
-      // fails. So we specify out's place here.
-      out->mutable_data(CPUPlace());
       Copy(dev_ctx, x, CPUPlace(), false, out);
       // NOTE(copy from Aurelius84): host <-> device memory copies of a memory
       // block of 64 KB or less are asynchronous. See
@@ -91,10 +61,6 @@ void MemcpyD2HKernel(const Context& dev_ctx,
       break;
     case 1:
-      // NOTE(lvyongkang): phi::Copy will use DeviceContext.zero_allocator to
-      // alloc and assign DeviceContext.place to out, which causes place check
-      // fails. So we specify out's place here.
-      out->mutable_data(GPUPinnedPlace());
       Copy(dev_ctx, x, GPUPinnedPlace(), false, out);
       // paddle::memory::Copy use async copy for GPUPinnedPlace
       dev_ctx.Wait();
...
@@ -74,7 +74,7 @@ void AddGradKernel(const Context& dev_ctx,
   }
   if (dy != nullptr) {
-    T* dy_data = dy->mutable_data<T>(dev_ctx.GetPlace());
+    T* dy_data = dev_ctx.template Alloc<T>(dy);
     if (dy->dims() == dz_dims) {
       if (dy_data != dz_data) {
         int ret = xpu::copy(dev_ctx.x_context(),
...
@@ -45,7 +45,7 @@ void TensorSetConstantXPU(phi::DenseTensor* tensor,
 template <typename T, typename Context, typename VType>
 void FullValueXPU(const Context& dev_ctx, DenseTensor* tensor, VType val) {
-  tensor->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(tensor);
   PD_VISIT_ALL_TYPES(tensor->dtype(), "FullValueXPU", ([&] {
                        TensorSetConstantXPU<VType, data_t>(
...
@@ -342,17 +342,17 @@ if [ "${PHI_INCLUDE_FLUID_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     check_approval 1 chenwhql YuanRisheng zyfncg
 fi
-HAS_MODIFIED_PHI_KERNEL_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/kernels" || true`
-PHI_USE_MUTABLE_DATA_FILES=""
-for CHANGE_FILE in ${HAS_MODIFIED_PHI_KERNEL_FILES}; do
-    PHI_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "mutable_data" || true`
-    if [ "${PHI_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-        PHI_USE_MUTABLE_DATA_FILES="${PHI_USE_MUTABLE_DATA_FILES} ${CHANGE_FILE}"
+HAS_MODIFIED_PHI_OR_FLUID_FILES=`git diff --name-only upstream/$BRANCH | grep -E "paddle/phi|paddle/fluid" || true`
+USE_MUTABLE_DATA_FILES=""
+for CHANGE_FILE in ${HAS_MODIFIED_PHI_OR_FLUID_FILES}; do
+    ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "mutable_data" || true`
+    if [ "${ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+        USE_MUTABLE_DATA_FILES="${USE_MUTABLE_DATA_FILES} ${CHANGE_FILE}"
     fi
 done
-if [ "${PHI_USE_MUTABLE_DATA_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You can not use the DenseTensor::mutable_data() method in paddle/phi/kernels files(${PHI_USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use phi::DeviceContext::Alloc() or phi::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, YuanRisheng or zyfncg) review and approve.\n"
-    check_approval 1 chenwhql Shixiaowei02 YuanRisheng zyfncg
+if [ "${USE_MUTABLE_DATA_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="You can not use the DenseTensor::mutable_data() method in files(${USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use phi::DeviceContext::Alloc() or phi::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, YuanRisheng, zyfncg or From00) review and approve.\n"
+    check_approval 1 chenwhql Shixiaowei02 YuanRisheng zyfncg From00
 fi
 ALL_CHANGE_FILES=`git diff --numstat upstream/$BRANCH | awk '{print $3}' | grep ".py"`
...