diff --git a/paddle/phi/kernels/funcs/batch_norm_utils.h b/paddle/phi/kernels/funcs/batch_norm_utils.h
index 21ebae8487ffc3588034a8ea5feeab8ac1c47fa8..a7ed7d36eb1c41f688e875912c0a8648cb42cb03 100644
--- a/paddle/phi/kernels/funcs/batch_norm_utils.h
+++ b/paddle/phi/kernels/funcs/batch_norm_utils.h
@@ -36,8 +36,7 @@ inline void ResizeToChannelFirst(const DeviceContext& context,
     in_dims_vec[3] = input->dims()[2];
     in_dims_vec[4] = input->dims()[3];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
-
+    context.template Alloc<T>(transformed_input);
   } else if (dim == 2) {
     // input
     transformed_input->Resize(input->dims());
@@ -47,7 +46,7 @@ inline void ResizeToChannelFirst(const DeviceContext& context,
     in_dims_vec[2] = input->dims()[1];
     in_dims_vec[3] = input->dims()[2];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
+    context.template Alloc<T>(transformed_input);
   } else if (dim == 1) {
     transformed_input->Resize(input->dims());
@@ -55,7 +54,7 @@
     in_dims_vec[1] = input->dims()[2];
     in_dims_vec[2] = input->dims()[1];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
+    context.template Alloc<T>(transformed_input);
   }
 }
@@ -74,7 +73,7 @@ inline void ResizeToChannelLast(const DeviceContext& context,
     in_dims_vec[3] = input->dims()[4];
     in_dims_vec[4] = input->dims()[1];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
+    context.template Alloc<T>(transformed_input);
   } else if (dim == 2) {
     // input
@@ -85,7 +84,7 @@
     in_dims_vec[2] = input->dims()[3];
     in_dims_vec[3] = input->dims()[1];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
+    context.template Alloc<T>(transformed_input);
   } else if (dim == 1) {
     transformed_input->Resize(input->dims());
@@ -93,7 +92,7 @@
     in_dims_vec[1] = input->dims()[2];
     in_dims_vec[2] = input->dims()[1];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
+    context.template Alloc<T>(transformed_input);
   }
 }
diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
index 2c9ee5ede010367697bb9477a536f807625fd02b..339c3536d7a7f476df0c2c46bf34ba48b73c07c3 100644
--- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
@@ -359,8 +359,8 @@ void BatchNormGradRawKernel(const Context &ctx,
   }
 
   if (d_scale && d_bias) {
-    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    ctx.template Alloc<BatchNormParamType<T>>(d_scale);
+    ctx.template Alloc<BatchNormParamType<T>>(d_bias);
   }
 
   PADDLE_ENFORCE_EQ(
@@ -569,8 +569,8 @@ void BatchNormGradRawKernel(const Context &ctx,
                 /*activationDesc=*/nullptr,
                 /*sizeInBytes=*/&workspace_size));
 
-        workspace_ptr = workspace_tensor.mutable_data(
-            ctx.GetPlace(), transformed_x.type(), workspace_size);
+        workspace_tensor.Resize({static_cast<int64_t>(workspace_size)});
+        workspace_ptr = ctx.template Alloc<T>(&workspace_tensor);
 
         PADDLE_ENFORCE_GPU_SUCCESS(
             paddle::platform::dynload::cudnnBatchNormalizationBackwardEx(
@@ -594,12 +594,9 @@ void BatchNormGradRawKernel(const Context &ctx,
                 /*dBnScaleBiasDesc=*/bn_param_desc_,
                 /*bnScaleData=*/scale.template data<BatchNormParamType<T>>(),
                 /*bnBiasData=*/nullptr,
-                /*dBnScaleData=*/d_scale
-                    ->template mutable_data<BatchNormParamType<T>>(
-                        ctx.GetPlace()),
-                /*dBnBiasData=*/d_bias
-                    ->template mutable_data<BatchNormParamType<T>>(
-                        ctx.GetPlace()),
+                /*dBnScaleData=*/ctx.template Alloc<BatchNormParamType<T>>(
+                    d_scale),
+                /*dBnBiasData=*/ctx.template Alloc<BatchNormParamType<T>>(d_bias),
                 /*epsilon=*/epsilon,
                 /*savedMean=*/saved_mean_data,
                 /*savedInvVariance=*/saved_var_data,
@@ -626,10 +623,8 @@ void BatchNormGradRawKernel(const Context &ctx,
              H * W * D,
              epsilon,
              transformed_d_x.template data<T>(),
-             d_scale->template mutable_data<BatchNormParamType<T>>(
-                 ctx.GetPlace()),
-             d_bias->template mutable_data<BatchNormParamType<T>>(
-                 ctx.GetPlace()));
+             ctx.template Alloc<BatchNormParamType<T>>(d_scale),
+             ctx.template Alloc<BatchNormParamType<T>>(d_bias));
       } else {
         BNBackward(),
-             d_scale->template mutable_data<BatchNormParamType<T>>(
-                 ctx.GetPlace()),
-             d_bias->template mutable_data<BatchNormParamType<T>>(
-                 ctx.GetPlace()));
+             ctx.template Alloc<BatchNormParamType<T>>(d_scale),
+             ctx.template Alloc<BatchNormParamType<T>>(d_bias));
       }
 
     // TODO(wangran16): wait for MIOpen to improve the performance of BN
@@ -682,10 +675,8 @@ void BatchNormGradRawKernel(const Context &ctx,
             ctx.template Alloc<T>(&transformed_d_x),
             bn_param_desc_,
             scale.template data<BatchNormParamType<T>>(),
-            d_scale->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            d_bias->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
+            ctx.template Alloc<BatchNormParamType<T>>(d_scale),
+            ctx.template Alloc<BatchNormParamType<T>>(d_bias),
             epsilon,
             saved_mean_data,
             saved_var_data));
diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
index 49b550f51e60e1cf31658f0d50afebf929a54079..74a523f4ecf942422a1f6c5ca9f710dc0e9d4cbf 100644
--- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
@@ -439,11 +439,11 @@ void BatchNormKernel(const Context &ctx,
       // Run training mode.
       // obtain running mean and running inv var, and there is no need
       // to initialize them.
-      mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      ctx.template Alloc<BatchNormParamType<T>>(mean_out);
+      ctx.template Alloc<BatchNormParamType<T>>(variance_out);
 
-      saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-      saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
+      ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
 
       if ((N * H * W * D) == 1) {
         // Only 1 element in normalization dimension,
@@ -497,10 +497,10 @@ void BatchNormKernel(const Context &ctx,
                 /*xDesc=*/data_desc_,
                 /*sizeInBytes=*/&reserve_space_size));
 
-        reserve_space_ptr = reserve_space->mutable_data(
-            ctx.GetPlace(), transformed_x.type(), reserve_space_size);
-        workspace_ptr = workspace_tensor.mutable_data(
-            ctx.GetPlace(), transformed_x.type(), workspace_size);
+        reserve_space->Resize({static_cast<int64_t>(reserve_space_size)});
+        reserve_space_ptr = ctx.template Alloc<T>(reserve_space);
+        workspace_tensor.Resize({static_cast<int64_t>(workspace_size)});
+        workspace_ptr = ctx.template Alloc<T>(&workspace_tensor);
         PADDLE_ENFORCE_GPU_SUCCESS(
             paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
                 handle,
@@ -518,15 +518,11 @@ void BatchNormKernel(const Context &ctx,
                 scale.template data<BatchNormParamType<T>>(),
                 bias.template data<BatchNormParamType<T>>(),
                 this_factor,
-                mean_out->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                variance_out->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
+                ctx.template Alloc<BatchNormParamType<T>>(mean_out),
+                ctx.template Alloc<BatchNormParamType<T>>(variance_out),
                 epsilon,
-                saved_mean->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                saved_variance->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
+                ctx.template Alloc<BatchNormParamType<T>>(saved_mean),
+                ctx.template Alloc<BatchNormParamType<T>>(saved_variance),
                 nullptr,
                 workspace_ptr,
                 workspace_size,
@@ -621,15 +617,11 @@ void BatchNormKernel(const Context &ctx,
              scale.template data<BatchNormParamType<T>>(),
              bias.template data<BatchNormParamType<T>>(),
              this_factor,
-             mean_out->template mutable_data<BatchNormParamType<T>>(
-                 ctx.GetPlace()),
-             variance_out->template mutable_data<BatchNormParamType<T>>(
-                 ctx.GetPlace()),
+             ctx.template Alloc<BatchNormParamType<T>>(mean_out),
+             ctx.template Alloc<BatchNormParamType<T>>(variance_out),
              epsilon,
-             saved_mean->template mutable_data<BatchNormParamType<T>>(
-                 ctx.GetPlace()),
-             saved_variance->template mutable_data<BatchNormParamType<T>>(
-                 ctx.GetPlace())));
+             ctx.template Alloc<BatchNormParamType<T>>(saved_mean),
+             ctx.template Alloc<BatchNormParamType<T>>(saved_variance)));
 #endif
       }
     }
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
index b4a6fe337c8d21e37beb0d6e5219e1a5edf1f9e8..9c5e77d5fd84661cdcc53dffc8f92a954df81041 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
@@ -71,15 +71,15 @@ void ConvCudnnGradGradKernel(
   auto dW = filter_grad;
   auto dX = input_grad;
   if (ddO) {
-    ddO->mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(ddO);
     phi::funcs::SetConstant<Context, T> set_zero;
     set_zero(ctx, ddO, static_cast<T>(0));
   }
   if (dW) {
-    dW->mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(dW);
   }
   if (dX) {
-    dX->mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(dX);
   }
 
   // const T* x = X->data<T>();
@@ -131,7 +131,7 @@ void ConvCudnnGradGradKernel(
     }
     if (dX) {
       ResizeToChannelFirst<Context, T>(ctx, dX, &transformed_dX_channel);
-      transformed_dX_channel.mutable_data<T>(ctx.GetPlace());
+      ctx.template Alloc<T>(&transformed_dX_channel);
     }
 
   } else {
@@ -186,13 +186,13 @@ void ConvCudnnGradGradKernel(
     transformed_ddX.Resize(new_input_shape);
     transformed_dX.Resize(new_input_shape);
 
-    transformed_X.mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(&transformed_X);
 
     if (ddX) {
-      transformed_ddX.mutable_data<T>(ctx.GetPlace());
+      ctx.template Alloc<T>(&transformed_ddX);
     }
     if (dX) {
-      transformed_dX.mutable_data<T>(ctx.GetPlace());
+      ctx.template Alloc<T>(&transformed_dX);
     }
 
     // pad for input
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index 64148e902fdb2123aa3f81846999b5d90f356cd6..a99a1e5f9471ed8cf2513c4690630ff24b00c284 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -58,10 +58,10 @@ void ConvCudnnGradKernel(const Context& ctx,
                          DenseTensor* input_grad,
                          DenseTensor* filter_grad) {
   if (input_grad) {
-    input_grad->mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(input_grad);
   }
   if (filter_grad) {
-    filter_grad->mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(filter_grad);
   }
 
   std::vector<int> dilations = dilations_t;
@@ -204,12 +204,12 @@ void ConvCudnnGradKernel(const Context& ctx,
     }
     DDim new_input_shape(make_ddim(new_input_shape_vec));
     transformed_input.Resize(new_input_shape);
-    transformed_input.mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(&transformed_input);
 
     transformed_input_grad.Resize(new_input_shape);
 
     if (input_grad) {
-      transformed_input_grad.mutable_data<T>(ctx.GetPlace());
+      ctx.template Alloc<T>(&transformed_input_grad);
     }
     // pad for input
     const int rank = transformed_input_channel.dims().size();
@@ -427,7 +427,7 @@ void ConvCudnnGradKernel(const Context& ctx,
       if (use_addto) {
         DenseTensor temp_tensor(transformed_input_grad.type());
         temp_tensor.Resize(transformed_input_grad.dims());
-        T* temp_tensor_data = temp_tensor.mutable_data<T>(ctx.GetPlace());
+        T* temp_tensor_data = ctx.template Alloc<T>(&temp_tensor);
         workspace_handle.RunFunc(
             [&](void* cudnn_workspace_ptr) {
               PADDLE_ENFORCE_GPU_SUCCESS(
@@ -513,7 +513,7 @@ void ConvCudnnGradKernel(const Context& ctx,
       axes[i] = i;
     }
 
-    transformed_input_grad_channel.mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(&transformed_input_grad_channel);
     if (transformed_input_channel.dims().size() == 4) {
       paddle::operators::RemovePaddingSlice<Context, T, 4>(
           ctx,
diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
index 931b6d68845e27297784603c2427178eae6b6f7d..c2970cc8cde75169602de5eec9f0e1424b71a701 100644
--- a/paddle/phi/kernels/gpudnn/conv_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -54,7 +54,7 @@ void ConvCudnnKernel(const Context& ctx,
                      int workspace_size_MB,
                      bool exhaustive_search_t,
                      DenseTensor* output) {
-  output->mutable_data<T>(ctx.GetPlace());
+  ctx.template Alloc<T>(output);
   std::vector<int> paddings = paddings_t;
   std::vector<int> dilations = dilations_t;
@@ -170,7 +170,7 @@ void ConvCudnnKernel(const Context& ctx,
     }
     DDim new_input_shape(make_ddim(new_input_shape_vec));
     transformed_input.Resize(new_input_shape);
-    transformed_input.mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(&transformed_input);
 
     const int rank = transformed_input_channel.dims().size();
     T pad_value(0.0);
diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h
index fbcebf371a61bd3d652888b5eaad56185499726b..bc0ed44e17a3346db42f2f858caceabb9d5351b7 100644
--- a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h
@@ -129,7 +129,7 @@ void ConvGradGradKernel(const Context& dev_ctx,
   DenseTensor col_matrix;
   if (is_expand) {
     col.Resize(col_shape);
-    col.mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(&col);
     col_matrix.ShareDataWith(col);
     col_matrix.Resize(col_matrix_shape);
   }
@@ -143,7 +143,7 @@ void ConvGradGradKernel(const Context& dev_ctx,
   if (dX && ddW_in) {
     Tensor ddW;
     ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
-    dX->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(dX);
 
     DenseTensor transformed_dX(dX->type());
 
@@ -201,7 +201,7 @@ void ConvGradGradKernel(const Context& dev_ctx,
   // oH, oW)
   // dw convolution double grad: im2col(vol2col) + gemm
   if (dW && ddX) {
-    dW->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(dW);
     set_zero(dev_ctx, dW, static_cast<T>(0));
     DenseTensor dW_arr = *dW;
     dW_arr.Resize(filter_matrix_shape);
@@ -244,7 +244,7 @@ void ConvGradGradKernel(const Context& dev_ctx,
   // w/ddw(Cout, Cin, kh, kw)
   // ddy convolution double grad: im2col(vol2col) + gemm
   if (ddY) {
-    ddY->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(ddY);
     DenseTensor transformed_ddY(ddY->type());
     if (channel_last) {
diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
index f1971aca800b59171a2e741dbebce6d8adaf7899..2deebb996a057a84bd5343be76969ce3a12e1aa1 100644
--- a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
@@ -128,7 +128,7 @@ void ConvGradKernel(const Context& dev_ctx,
   DenseTensor col_matrix;
   if (is_expand) {
     col.Resize(col_shape);
-    col.mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(&col);
     col_matrix.ShareDataWith(col);
     col_matrix.Resize(col_matrix_shape);
   }
@@ -137,7 +137,7 @@ void ConvGradKernel(const Context& dev_ctx,
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
   if (input_grad) {
-    input_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(input_grad);
     DenseTensor transformed_input_grad(input_grad->type());
     if (channel_last) {
       ResizeToChannelFirst<Context, T>(
           dev_ctx, input_grad, &transformed_input_grad);
@@ -203,7 +203,7 @@ void ConvGradKernel(const Context& dev_ctx,
   }
 
   if (filter_grad) {
-    filter_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(filter_grad);
     Tensor filter_grad_ = *filter_grad;
     filter_grad_.Resize(filter_matrix_shape);
     set_zero(dev_ctx, filter_grad, static_cast<T>(0));
diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h
index 1945468f02551b8e348687ae578c9f23a038b8ca..2ef2ed8af2809c453db4f5a8c20ed4e004bf64be 100644
--- a/paddle/phi/kernels/impl/conv_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_kernel_impl.h
@@ -44,7 +44,7 @@ void ConvKernel(const Context& dev_ctx,
   // The filter will be reshaped in the calculations,
   // so here use an assignment operation,
   // that avoids modifying the variable in the Scope.
-  output->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(output);
   const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
@@ -115,7 +115,7 @@ void ConvKernel(const Context& dev_ctx,
   if (is_expand) {
     // col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
     col.Resize(col_shape);
-    col.mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(&col);
     col_matrix.ShareDataWith(col);
     col_matrix.Resize(col_matrix_shape);
   }
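
Note: every hunk above applies the same mechanical change: the legacy DenseTensor::mutable_data<T>(place) call is replaced by the phi device-context allocation API, dev_ctx.template Alloc<T>(&tensor), which performs the allocation through the context and returns the typed data pointer. A minimal sketch of the two styles inside a phi kernel is shown below; the kernel and tensor names are illustrative only and are not part of this diff.

    #include "paddle/phi/core/dense_tensor.h"

    namespace phi {

    // Illustrative kernel, not part of the patch above.
    template <typename T, typename Context>
    void ExampleAllocKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            DenseTensor* out) {
      out->Resize(x.dims());
      // Old style: the tensor allocates its own memory on a given place.
      //   T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
      // New style: the device context allocates the tensor's memory and
      // returns the typed pointer, so the call can also be used inline as
      // a function argument, exactly as the hunks above do.
      T* out_data = dev_ctx.template Alloc<T>(out);
      (void)out_data;  // the pointer would normally feed the computation
    }

    }  // namespace phi

The `template` keyword in dev_ctx.template Alloc<T>(...) is required because the context type is a dependent template parameter; without it the `<` would be parsed as a less-than comparison.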