diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc
index c6a773ebe5fc77600dea614129c1163a69c504a7..ad3e85d4696b793ec8a0ac09c94f9fbd17872188 100644
--- a/paddle/phi/api/lib/data_transform.cc
+++ b/paddle/phi/api/lib/data_transform.cc
@@ -22,8 +22,6 @@ limitations under the License. */
 #include "paddle/phi/kernels/cast_kernel.h"
 #include "paddle/phi/kernels/transfer_layout_kernel.h"
 
-#include "paddle/fluid/framework/tensor_util.h"
-
 namespace paddle {
 namespace experimental {
 
@@ -169,8 +167,8 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor,
   VLOG(3) << "DeviceTransform in, src_place " << tensor.place()
           << " dst_place: " << dst_place;
 
+  auto& pool = phi::DeviceContextPool::Instance();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
   // NOTE(yy): TransDataPlace should wait for computation of input.
   if (!platform::is_cuda_pinned_place(tensor.place())) {
     pool.Get(tensor.place())->Wait();
@@ -188,7 +186,13 @@
   // But the embarrassment is that this solution makes training
   // slower.
   phi::DenseTensor out;
-  paddle::framework::TensorCopySync(tensor, dst_place, &out);
+  phi::DeviceContext* dev_ctx;
+  if (dst_place.GetType() != AllocationType::CPU) {
+    dev_ctx = pool.Get(dst_place);
+  } else {
+    dev_ctx = pool.Get(tensor.place());
+  }
+  phi::Copy(*dev_ctx, tensor, dst_place, true, &out);
   return out;
 }
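
Here TensorCopySync's implicit context lookup becomes explicit: the pool handle is hoisted above the CUDA-only block because the fallback path now needs it too, and the non-CPU side's context is preferred since only a device context knows the stream on which the memcpy must be ordered. A minimal standalone sketch of the same pattern (helper name is illustrative, not part of the patch):

    #include "paddle/phi/backends/all_context.h"
    #include "paddle/phi/core/tensor_utils.h"

    // Sketch: synchronous copy to an arbitrary place, choosing the context
    // the same way TransDataPlace does above.
    phi::DenseTensor CopyToPlaceSync(const phi::DenseTensor& src,
                                     const phi::Place& dst_place) {
      auto& pool = phi::DeviceContextPool::Instance();
      // Prefer the non-CPU side's context; a plain CPU context carries no
      // device stream to order the transfer on.
      phi::DeviceContext* dev_ctx =
          dst_place.GetType() != phi::AllocationType::CPU
              ? pool.Get(dst_place)
              : pool.Get(src.place());
      phi::DenseTensor out;
      phi::Copy(*dev_ctx, src, dst_place, /*blocking=*/true, &out);
      return out;
    }
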
diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt
index 3b2314b0963cf6ba6f53f7758bbe4c3e9e551fa8..e90cdc9e0663abe516c932d49b4572debe420c4b 100644
--- a/paddle/phi/backends/CMakeLists.txt
+++ b/paddle/phi/backends/CMakeLists.txt
@@ -63,7 +63,7 @@ if(WITH_CUSTOM_DEVICE)
   cc_test(
     custom_device_test
     SRCS custom/custom_device_test.cc
-    DEPS phi_backends phi_device_context gradient_accumulator)
+    DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator)
   cc_test(
     capi_test
     SRCS custom/capi_test.cc
diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt
index d96cb3e895a1ae52034d1f243ed8cd3664cc906c..bbe40970f75bc0077eee5b7e4f27611eef297711 100644
--- a/paddle/phi/common/CMakeLists.txt
+++ b/paddle/phi/common/CMakeLists.txt
@@ -15,8 +15,8 @@ endif()
 cc_library(
   scalar
   SRCS scalar.cc
-  DEPS phi_enforce tensor)
+  DEPS phi_enforce phi_tensor_utils)
 cc_library(
   int_array
   SRCS int_array.cc
-  DEPS phi_enforce tensor)
+  DEPS phi_enforce phi_tensor_utils)
diff --git a/paddle/phi/common/int_array.cc b/paddle/phi/common/int_array.cc
index 4aadae48c158fecf3335e2a96b324984eea2cf6c..707d4513df5f3ebf0e2c7700258dec0ae64aad64 100644
--- a/paddle/phi/common/int_array.cc
+++ b/paddle/phi/common/int_array.cc
@@ -14,8 +14,10 @@ limitations under the License. */
 
 #include "paddle/phi/common/int_array.h"
 
-#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/place.h"
+#include "paddle/phi/core/tensor_utils.h"
 
 namespace paddle {
 namespace experimental {
 
@@ -28,7 +30,9 @@ IntArrayBase<phi::DenseTensor>::IntArrayBase(
     AssignDataFromTensor(tensor);
   } else {
     phi::DenseTensor tensor_tmp;
-    paddle::framework::TensorCopySync(tensor, CPUPlace(), &tensor_tmp);
+    phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+    auto dev_ctx = pool.Get(tensor.place());
+    phi::Copy(*dev_ctx, tensor, CPUPlace(), true, &tensor_tmp);
     AssignDataFromTensor(tensor_tmp);
   }
 }
@@ -45,8 +49,9 @@ IntArrayBase<phi::DenseTensor>::IntArrayBase(
         array_.push_back(*tensor_list[i].template data<int32_t>());
       } else {
         phi::DenseTensor tensor_tmp;
-        paddle::framework::TensorCopySync(
-            tensor_list[i], CPUPlace(), &tensor_tmp);
+        phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+        auto dev_ctx = pool.Get(tensor_list[i].place());
+        phi::Copy(*dev_ctx, tensor_list[i], CPUPlace(), true, &tensor_tmp);
         array_.push_back(*tensor_tmp.template data<int32_t>());
       }
       break;
@@ -55,8 +60,9 @@
         array_.push_back(*tensor_list[i].template data<int64_t>());
       } else {
         phi::DenseTensor tensor_tmp;
-        paddle::framework::TensorCopySync(
-            tensor_list[i], CPUPlace(), &tensor_tmp);
+        phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+        auto dev_ctx = pool.Get(tensor_list[i].place());
+        phi::Copy(*dev_ctx, tensor_list[i], CPUPlace(), true, &tensor_tmp);
         array_.push_back(*tensor_tmp.template data<int64_t>());
       }
       break;
diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc
index b558739418d71111d60d6a44e609c66fa7666056..1b161d9ac6088e65abf204458ee000a92ff0bf64 100644
--- a/paddle/phi/common/scalar.cc
+++ b/paddle/phi/common/scalar.cc
@@ -14,9 +14,11 @@ limitations under the License. */
 
 #include "paddle/phi/common/scalar.h"
 
-#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/tensor_utils.h"
 
 namespace paddle {
 namespace experimental {
 
@@ -31,9 +33,11 @@ ScalarBase<phi::DenseTensor>::ScalarBase(const phi::DenseTensor& tensor_in)
           "now Tensor has `%d` elements",
           tensor_in.numel()));
   auto cpu_place = phi::CPUPlace();
-  if (!paddle::platform::is_same_place(tensor_in.place(), cpu_place)) {
+  if (tensor_in.place().GetType() != phi::AllocationType::CPU) {
     phi::DenseTensor tensor;
-    framework::TensorCopySync(tensor_in, cpu_place, &tensor);
+    phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+    auto dev_ctx = pool.Get(tensor_in.place());
+    phi::Copy(*dev_ctx, tensor_in, cpu_place, true, &tensor);
     GetDataFromTensor(tensor);
   } else {
     GetDataFromTensor(tensor_in);
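
The Scalar and IntArray constructors above share one pattern: a device tensor is staged through a blocking copy to CPU before its value is read. A minimal sketch of that staging step (helper name is illustrative, not from the patch):

    #include "paddle/phi/backends/all_context.h"
    #include "paddle/phi/core/tensor_utils.h"

    // Blocking device-to-host staging, as used by the Scalar/IntArray
    // constructors above (illustrative helper).
    phi::DenseTensor StageToCpu(const phi::DenseTensor& t) {
      if (t.place().GetType() == phi::AllocationType::CPU) {
        return t;  // already host-resident; no copy needed
      }
      phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
      phi::DeviceContext* dev_ctx = pool.Get(t.place());  // source device's context
      phi::DenseTensor cpu_tensor;
      phi::Copy(*dev_ctx, t, phi::CPUPlace(), /*blocking=*/true, &cpu_tensor);
      return cpu_tensor;
    }
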
diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc
index 379558b0b5de626afe984134cea444da20595088..79f4388c096bdef6c1702aaa21fecac64e3cdda8 100644
--- a/paddle/phi/core/tensor_utils.cc
+++ b/paddle/phi/core/tensor_utils.cc
@@ -36,7 +36,7 @@ void Copy(const Context& dev_ctx,
   const auto& src_place = src.place();
 
   if (&src == dst) {
-    if (paddle::platform::is_same_place(src_place, dst_place)) {
+    if (src_place.GetType() == dst_place.GetType()) {
       VLOG(6) << "Skip copy the same data(" << src_ptr << ") from "
               << src_place << " to " << dst_place;
     } else {
@@ -54,24 +54,24 @@
   dst->Resize(src.dims());
 
   void* dst_ptr = nullptr;
-  if (paddle::platform::is_cpu_place(dst_place)) {
+  if (dst_place.GetType() == AllocationType::CPU) {
     dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
 #ifdef PADDLE_WITH_MKLDNN
     dst->set_layout(src.layout());
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  } else if (paddle::platform::is_gpu_place(dst_place) ||
-             paddle::platform::is_cuda_pinned_place(dst_place)) {
+  } else if (dst_place.GetType() == AllocationType::GPU ||
+             dst_place.GetType() == AllocationType::GPUPINNED) {
     dst_ptr = dev_ctx.Alloc(
-        dst, src.dtype(), 0, paddle::platform::is_cuda_pinned_place(dst_place));
+        dst, src.dtype(), 0, dst_place.GetType() == AllocationType::GPUPINNED);
 #endif
 #ifdef PADDLE_WITH_XPU
-  } else if (paddle::platform::is_xpu_place(dst_place)) {
+  } else if (dst_place.GetType() == AllocationType::XPU) {
     dst_ptr = dev_ctx.Alloc(dst, src.dtype());
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-  } else if (paddle::platform::is_custom_place(dst_place)) {
+  } else if (dst_place.GetType() == AllocationType::CUSTOM) {
     dst_ptr = dev_ctx.Alloc(dst, src.dtype());
 #endif
   }
@@ -98,22 +98,22 @@
   VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
   CHECK(dst->layout() == src.layout());
 
-  if (paddle::platform::is_cpu_place(src_place) &&
-      paddle::platform::is_cpu_place(dst_place)) {
+  if (src_place.GetType() == AllocationType::CPU &&
+      dst_place.GetType() == AllocationType::CPU) {
     paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  } else if ((paddle::platform::is_cpu_place(src_place) ||
-              paddle::platform::is_cuda_pinned_place(src_place)) &&  // NOLINT
-             (paddle::platform::is_cpu_place(dst_place) ||
-              paddle::platform::is_cuda_pinned_place(dst_place))) {
+  } else if ((src_place.GetType() == AllocationType::CPU ||
+              src_place.GetType() == AllocationType::GPUPINNED) &&  // NOLINT
+             (dst_place.GetType() == AllocationType::CPU ||
+              dst_place.GetType() == AllocationType::GPUPINNED)) {
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
-  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
-             paddle::platform::is_cpu_place(dst_place)) {
+  } else if (src_place.GetType() == AllocationType::GPU &&  // NOLINT
+             dst_place.GetType() == AllocationType::CPU) {
     auto src_gpu_place = src_place;
     auto dst_cpu_place = dst_place;
     auto ctx_place = dev_ctx.GetPlace();
     PADDLE_ENFORCE_EQ(
-        paddle::platform::is_gpu_place(ctx_place),
+        ctx_place.GetType() == AllocationType::GPU,
         true,
         errors::PreconditionNotMet(
             "Context place error, expected GPUPlace, but actually %s.",
@@ -131,14 +131,14 @@
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
     paddle::memory::Copy(
         dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
-  } else if ((paddle::platform::is_cpu_place(src_place) ||
-              paddle::platform::is_cuda_pinned_place(src_place)) &&  // NOLINT
-             paddle::platform::is_gpu_place(dst_place)) {
+  } else if ((src_place.GetType() == AllocationType::CPU ||
+              src_place.GetType() == AllocationType::GPUPINNED) &&  // NOLINT
+             dst_place.GetType() == AllocationType::GPU) {
     auto src_cpu_place = src_place;
     auto dst_gpu_place = dst_place;
     auto ctx_place = dev_ctx.GetPlace();
     PADDLE_ENFORCE_EQ(
-        paddle::platform::is_gpu_place(ctx_place),
+        ctx_place.GetType() == AllocationType::GPU,
         true,
         errors::PreconditionNotMet(
             "Context place error, expected GPUPlace, but actually %s.",
@@ -156,13 +156,13 @@
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
     paddle::memory::Copy(
         dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
-  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
-             paddle::platform::is_gpu_place(dst_place)) {
+  } else if (src_place.GetType() == AllocationType::GPU &&  // NOLINT
+             dst_place.GetType() == AllocationType::GPU) {
     auto src_gpu_place = src_place;
     auto dst_gpu_place = dst_place;
     auto ctx_place = dev_ctx.GetPlace();
     PADDLE_ENFORCE_EQ(
-        paddle::platform::is_gpu_place(ctx_place),
+        ctx_place.GetType() == AllocationType::GPU,
         true,
         errors::PreconditionNotMet(
             "Context place error, expected GPUPlace, but actually %s.",
@@ -170,20 +170,16 @@
     auto stream =
         blocking ? nullptr
                 : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
-    if (paddle::platform::is_same_place(src_place, dst_place)) {
+    if (src_place.GetType() == dst_place.GetType()) {
       paddle::memory::Copy(
           dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
     } else {
-      if (paddle::platform::is_same_place(ctx_place, src_place)) {
+      if (ctx_place.GetType() == src_place.GetType()) {
         paddle::memory::Copy(
             dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
-        paddle::platform::DeviceContextPool::Instance()
-            .Get(src.place())
-            ->Wait();
-      } else if (paddle::platform::is_same_place(ctx_place, dst_place)) {
-        paddle::platform::DeviceContextPool::Instance()
-            .Get(src.place())
-            ->Wait();
+        phi::DeviceContextPool::Instance().Get(src.place())->Wait();
+      } else if (ctx_place.GetType() == dst_place.GetType()) {
+        phi::DeviceContextPool::Instance().Get(src.place())->Wait();
         paddle::memory::Copy(
             dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
       } else {
@@ -191,13 +187,13 @@
         PADDLE_THROW(errors::Unavailable(
             "Context place does not match the source and destination place."));
       }
     }
-  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
-             paddle::platform::is_cuda_pinned_place(dst_place)) {
+  } else if (src_place.GetType() == AllocationType::GPU &&  // NOLINT
+             dst_place.GetType() == AllocationType::GPUPINNED) {
     auto src_gpu_place = src_place;
     auto dst_cuda_pinned_place = dst_place;
     auto ctx_place = dev_ctx.GetPlace();
     PADDLE_ENFORCE_EQ(
-        paddle::platform::is_gpu_place(ctx_place),
+        ctx_place.GetType() == AllocationType::GPU,
         true,
         errors::PreconditionNotMet(
             "Context place error, expected GPUPlace, but actually %s.",
@@ -217,14 +213,14 @@
         dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
 #endif
 #ifdef PADDLE_WITH_XPU
-  } else if (paddle::platform::is_xpu_place(src_place) &&  // NOLINT
-             paddle::platform::is_cpu_place(dst_place)) {
+  } else if (src_place.GetType() == AllocationType::XPU &&  // NOLINT
+             dst_place.GetType() == AllocationType::CPU) {
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  } else if (paddle::platform::is_cpu_place(src_place) &&
-             paddle::platform::is_xpu_place(dst_place)) {
+  } else if (src_place.GetType() == AllocationType::CPU &&
+             dst_place.GetType() == AllocationType::XPU) {
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  } else if (paddle::platform::is_xpu_place(src_place) &&
-             paddle::platform::is_xpu_place(dst_place)) {
+  } else if (src_place.GetType() == AllocationType::XPU &&
+             dst_place.GetType() == AllocationType::XPU) {
     if (src_ptr == dst_ptr) {
       VLOG(3) << "Skip copy the same data async from " << src_place << " to "
               << dst_place;
@@ -233,32 +229,26 @@ void Copy(const Context& dev_ctx,
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-  } else if (paddle::platform::is_custom_place(src_place) &&  // NOLINT
-             paddle::platform::is_cpu_place(dst_place)) {
+  } else if (src_place.GetType() == AllocationType::CUSTOM &&  // NOLINT
+             dst_place.GetType() == AllocationType::CPU) {
     auto stream =
         blocking
            ? nullptr
-            : reinterpret_cast<const paddle::platform::CustomDeviceContext&>(
-                  dev_ctx)
-                  .stream();
+            : reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
-  } else if (paddle::platform::is_cpu_place(src_place) &&  // NOLINT
-             paddle::platform::is_custom_place(dst_place)) {
+  } else if (src_place.GetType() == AllocationType::CPU &&  // NOLINT
+             dst_place.GetType() == AllocationType::CUSTOM) {
     auto stream =
         blocking
            ? nullptr
-            : reinterpret_cast<const paddle::platform::CustomDeviceContext&>(
-                  dev_ctx)
-                  .stream();
+            : reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
-  } else if (paddle::platform::is_custom_place(src_place) &&  // NOLINT
-             paddle::platform::is_custom_place(dst_place)) {
+  } else if (src_place.GetType() == AllocationType::CUSTOM &&  // NOLINT
+             dst_place.GetType() == AllocationType::CUSTOM) {
     auto stream =
         blocking
           ? nullptr
-            : reinterpret_cast<const paddle::platform::CustomDeviceContext&>(
-                  dev_ctx)
-                  .stream();
+            : reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
 #endif
   } else {
@@ -435,11 +425,11 @@ void TensorFromVector(const std::vector<T>& src,
   auto dst_ptr = static_cast<void*>(dst->data<T>());
   auto size = src.size() * sizeof(T);
 
-  if (paddle::platform::is_cpu_place(dst_place)) {
+  if (dst_place.GetType() == AllocationType::CPU) {
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  else if (paddle::platform::is_gpu_place(dst_place)) {  // NOLINT
+  else if (dst_place.GetType() == AllocationType::GPU) {  // NOLINT
     paddle::memory::Copy(
         dst_place,
         dst_ptr,
@@ -450,7 +440,7 @@ void TensorFromVector(const std::vector<T>& src,
   }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-  else if (paddle::platform::is_custom_place(dst_place)) {  // NOLINT
+  else if (dst_place.GetType() == AllocationType::CUSTOM) {  // NOLINT
     paddle::memory::Copy(
         dst_place,
         dst_ptr,
@@ -461,7 +451,7 @@ void TensorFromVector(const std::vector<T>& src,
   }
 #endif
 #ifdef PADDLE_WITH_XPU
-  else if (paddle::platform::is_xpu_place(dst_place)) {  // NOLINT
+  else if (dst_place.GetType() == AllocationType::XPU) {  // NOLINT
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #endif
@@ -490,11 +480,11 @@ void TensorFromVector(const std::vector<bool>& src,
   auto dst_ptr = ctx.template Alloc<bool>(dst);
   auto size = src.size() * sizeof(bool);
 
-  if (paddle::platform::is_cpu_place(dst_place)) {
+  if (dst_place.GetType() == AllocationType::CPU) {
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
-  else if (paddle::platform::is_gpu_place(dst_place)) {  // NOLINT
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  else if (dst_place.GetType() == AllocationType::GPU) {  // NOLINT
     paddle::memory::Copy(
         dst_place,
         dst_ptr,
@@ -505,13 +495,13 @@ void TensorFromVector(const std::vector<bool>& src,
   }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-  else if (paddle::platform::is_custom_place(dst_place)) {  // NOLINT
+  else if (dst_place.GetType() == AllocationType::CUSTOM) {  // NOLINT
     auto stream = reinterpret_cast<const phi::CustomContext&>(ctx).stream();
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
   }
 #endif
 #ifdef PADDLE_WITH_XPU
-  else if (paddle::platform::is_xpu_place(dst_place)) {  // NOLINT
+  else if (dst_place.GetType() == AllocationType::XPU) {  // NOLINT
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #endif
@@ -583,11 +573,11 @@ void TensorFromArray(const T* src,
   auto dst_ptr = static_cast<void*>(dst->data<T>());
   auto size = array_size * sizeof(T);
 
-  if (paddle::platform::is_cpu_place(dst_place)) {
+  if (dst_place.GetType() == AllocationType::CPU) {
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  else if (paddle::platform::is_gpu_place(dst_place)) {  // NOLINT
+  else if (dst_place.GetType() == AllocationType::GPU) {  // NOLINT
     paddle::memory::Copy(
         dst_place,
         dst_ptr,
@@ -598,7 +588,7 @@ void TensorFromArray(const T* src,
   }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-  else if (paddle::platform::is_custom_place(dst_place)) {  // NOLINT
+  else if (dst_place.GetType() == AllocationType::CUSTOM) {  // NOLINT
     paddle::memory::Copy(
         dst_place,
         dst_ptr,
@@ -609,7 +599,7 @@ void TensorFromArray(const T* src,
   }
 #endif
 #ifdef PADDLE_WITH_XPU
-  else if (paddle::platform::is_xpu_place(dst_place)) {  // NOLINT
+  else if (dst_place.GetType() == AllocationType::XPU) {  // NOLINT
     paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #endif
@@ -684,11 +674,11 @@ void TensorToVector(const phi::DenseTensor& src,
   dst->resize(src.numel());
   auto dst_ptr = static_cast<void*>(dst->data());
 
-  if (paddle::platform::is_cpu_place(src.place())) {
+  if (src.place().GetType() == AllocationType::CPU) {
     paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  else if (paddle::platform::is_gpu_place(src.place())) {  // NOLINT
+  else if (src.place().GetType() == AllocationType::GPU) {  // NOLINT
     paddle::memory::Copy(
         dst_place,
         dst_ptr,
@@ -699,12 +689,12 @@ void TensorToVector(const phi::DenseTensor& src,
   }
 #endif
 #if defined(PADDLE_WITH_XPU)
-  else if (paddle::platform::is_xpu_place(src.place())) {  // NOLINT
+  else if (src.place().GetType() == AllocationType::XPU) {  // NOLINT
     paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-  else if (paddle::platform::is_custom_place(src.place())) {  // NOLINT
+  else if (src.place().GetType() == AllocationType::CUSTOM) {  // NOLINT
     paddle::memory::Copy(
         dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
   }
@@ -728,11 +718,11 @@ void TensorToVector(const phi::DenseTensor& src,
   dst->resize(src.numel());
   auto dst_ptr = static_cast<void*>(array);
 
-  if (paddle::platform::is_cpu_place(src.place())) {
+  if (src.place().GetType() == AllocationType::CPU) {
     paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  else if (paddle::platform::is_gpu_place(src.place())) {  // NOLINT
+  else if (src.place().GetType() == AllocationType::GPU) {  // NOLINT
     paddle::memory::Copy(
         dst_place,
         dst_ptr,
@@ -743,12 +733,12 @@ void TensorToVector(const phi::DenseTensor& src,
   }
 #endif
 #if defined(PADDLE_WITH_XPU)
-  else if (paddle::platform::is_xpu_place(src.place())) {  // NOLINT
+  else if (src.place().GetType() == AllocationType::XPU) {  // NOLINT
     paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-  else if (paddle::platform::is_custom_place(src.place())) {  // NOLINT
+  else if (src.place().GetType() == AllocationType::CUSTOM) {  // NOLINT
     paddle::memory::Copy(
         dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
   }
@@ -805,7 +795,7 @@ void TensorToVector(const phi::DenseTensor& src,
                     std::vector<T>* dst) {
   auto dst_ptr = static_cast<void*>(dst->data());
   PADDLE_ENFORCE_EQ(
-      paddle::platform::is_cpu_place(src.place()),
+      src.place().GetType() == AllocationType::CPU,
       true,
       phi::errors::InvalidArgument(
          "The input tensor should be CPU device, but actually it is in %s.",
@@ -821,12 +811,12 @@ void TensorToVector(const phi::DenseTensor& src,
                     std::vector<bool>* dst) {
   bool* array = new bool[src.numel()];
 
-  paddle::platform::CPUPlace dst_place{};
+  phi::CPUPlace dst_place{};
   dst->resize(src.numel());
   auto dst_ptr = static_cast<void*>(array);
 
   PADDLE_ENFORCE_EQ(
-      paddle::platform::is_cpu_place(src.place()),
+      src.place().GetType() == AllocationType::CPU,
       true,
       phi::errors::InvalidArgument(
           "The input tensor should be CPU device, but actually it is in %s.",
@@ -891,7 +881,7 @@ phi::DenseTensor ReshapeToMatrix(const phi::DenseTensor& src,
 template <typename T>
 T GetValue(const phi::DenseTensor* x) {
   T value = static_cast<T>(0);
-  if (!paddle::platform::is_cpu_place(x->place())) {
+  if (x->place().GetType() != AllocationType::CPU) {
     phi::DenseTensor cpu_x{};
     phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
     phi::DeviceContext* dev_ctx = pool.Get(x->place());
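
One behavioral nuance in the Copy changes above: paddle::platform::is_same_place compared the full Place (allocation type plus device id), while the replacement compares only GetType(). In the GPU-to-GPU branch this appears to make the cross-device else-path unreachable, since two GPU places always have equal types. A small illustration of the difference (standalone, not part of the patch):

    #include "paddle/phi/common/place.h"

    // GetType() equality is coarser than full Place equality.
    void PlaceComparisonSketch() {
      phi::Place gpu0(phi::AllocationType::GPU, 0);
      phi::Place gpu1(phi::AllocationType::GPU, 1);

      bool same_type = gpu0.GetType() == gpu1.GetType();  // true: both GPU
      bool same_place = gpu0 == gpu1;                     // false: device ids differ
      (void)same_type;
      (void)same_place;
    }
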
diff --git a/paddle/phi/kernels/cpu/amp_kernel.cc b/paddle/phi/kernels/cpu/amp_kernel.cc
index 23048ba337df888f8aafa957c473840bec8d45fa..7625339042589c89e670e50392d9c251c4aed32c 100644
--- a/paddle/phi/kernels/cpu/amp_kernel.cc
+++ b/paddle/phi/kernels/cpu/amp_kernel.cc
@@ -24,8 +24,6 @@
 #include "paddle/phi/kernels/isfinite_kernel.h"
 #include "paddle/phi/kernels/reduce_all_kernel.h"
 
-#include "paddle/fluid/framework/tensor_util.h"
-
 namespace phi {
 
 // Utils
diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
index 49555410f99201ebec6adf8b0708c8f0ab4f8b9f..c2da486e9f7521b21f2953977738d52c2e5ddf87 100644
--- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
@@ -163,7 +162,7 @@ void BatchNormGradRawKernel(const Context& ctx,
   }
 
   if (d_x && (N * sample_size) == 1 && !use_global_stats) {
-    paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
+    phi::Copy(ctx, *d_y, ctx.GetPlace(), false, d_x);
     return;
   }
diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc
index 332df1d9f137ebf82db97af1dad24d56d85d8c91..8768b78c6ff07abee91b6648a955159f31533d09 100644
--- a/paddle/phi/kernels/cpu/batch_norm_kernel.cc
+++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc
@@ -14,7 +14,6 @@
 
 #include "paddle/phi/kernels/batch_norm_kernel.h"
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
@@ -106,7 +105,7 @@ void BatchNormKernel(const Context& ctx,
   if ((N * sample_size) == 1) {
     // Only 1 element in normalization dimension,
     // we skip the batch norm calculation, let y = x.
-    paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
+    phi::Copy(ctx, x, ctx.GetPlace(), false, y);
     return;
   }
diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
index 0d549ae46e2170ef202a323417c073a9b631e2e9..ba257be5e2c4e5fab43f75f4cb3f7a69cc419da0 100644
--- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
@@ -16,7 +16,6 @@
 
 #include <vector>
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
diff --git a/paddle/phi/kernels/cpu/cross_kernel.cc b/paddle/phi/kernels/cpu/cross_kernel.cc
index a37efa2d3ccdbf8ad8a385fbe51c72c5162862ea..55e25ffca4c8ca3141586b69e0e57d14d526e95f 100644
--- a/paddle/phi/kernels/cpu/cross_kernel.cc
+++ b/paddle/phi/kernels/cpu/cross_kernel.cc
@@ -14,7 +14,6 @@
 
 #include "paddle/phi/kernels/cross_kernel.h"
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/interpolate_kernel.cc b/paddle/phi/kernels/cpu/interpolate_kernel.cc
index 13a97b4f5644e50bdc892956d3c4efc41cffb096..1cdde3a7b1e1a1220e091bdc136f187dcbaeb870 100644
--- a/paddle/phi/kernels/cpu/interpolate_kernel.cc
+++ b/paddle/phi/kernels/cpu/interpolate_kernel.cc
@@ -572,7 +572,7 @@ static void Interpolate1DCPUFwd(
   dev_ctx.template Alloc<T>(output);
 
   if (in_w == out_w) {
-    paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output);
+    phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, output);
     return;
   }
@@ -702,7 +702,7 @@ static void Interpolate2DCPUFwd(
   dev_ctx.template Alloc<T>(output);
 
   if (in_h == out_h && in_w == out_w) {
-    paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output);
+    phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, output);
     return;
   }
@@ -897,7 +897,7 @@ static void Interpolate3DCPUFwd(
   dev_ctx.template Alloc<T>(output);
 
   if (in_d == out_d && in_h == out_h && in_w == out_w) {
-    paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output);
+    phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, output);
     return;
   }
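
The blocking flag preserves the old helpers' semantics: TensorCopySync maps to blocking = true (the host waits for completion), TensorCopy maps to blocking = false (the copy is only enqueued on the context's stream). A hedged illustration of the correspondence (wrapper is a sketch, not from the patch):

    // Sketch of the two blocking modes of phi::Copy.
    template <typename Context>
    void CopyLikeOldHelpers(const Context& dev_ctx,
                            const phi::DenseTensor& x,
                            phi::DenseTensor* sync_out,
                            phi::DenseTensor* async_out) {
      // TensorCopySync(x, place, out): host waits until the copy finishes.
      phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), /*blocking=*/true, sync_out);
      // TensorCopy(x, place, out): only enqueued; later work queued on the
      // same context is still correctly ordered after the copy.
      phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), /*blocking=*/false, async_out);
    }
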
diff --git a/paddle/phi/kernels/funcs/adam_functors.h b/paddle/phi/kernels/funcs/adam_functors.h
index 4edc83ca30a28f6aa233f58bb9710f7a62870df6..e508c11030a64ed2b95d7b95a1eaf7e7ac585ada 100644
--- a/paddle/phi/kernels/funcs/adam_functors.h
+++ b/paddle/phi/kernels/funcs/adam_functors.h
@@ -23,7 +23,6 @@
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/backends/xpu/xpu_header.h"
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/memory/memcpy.h"
 #endif
 
diff --git a/paddle/phi/kernels/funcs/interpolate_function.h b/paddle/phi/kernels/funcs/interpolate_function.h
index 53b0577fc29d776a38a927d095f4bf3db88a0fdf..2a11be43b5df696a02fac6942293f5448c315bc1 100644
--- a/paddle/phi/kernels/funcs/interpolate_function.h
+++ b/paddle/phi/kernels/funcs/interpolate_function.h
@@ -14,7 +14,6 @@
 
 #pragma once
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
@@ -83,8 +82,10 @@ inline std::vector<int> get_new_shape(
     const std::vector<const DenseTensor*>& list_new_shape_tensor) {
   // get tensor from
   std::vector<int> vec_new_shape;
+  auto& pool = phi::DeviceContextPool::Instance();
   for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
     auto tensor = list_new_shape_tensor[i];
+    phi::DeviceContext* dev_ctx = pool.Get(tensor->place());
     PADDLE_ENFORCE_EQ(tensor->dims() == phi::make_ddim({1}) ||
                           tensor->dims() == phi::make_ddim({}),
                       true,
@@ -96,15 +97,14 @@
 #ifdef PADDLE_WITH_XPU
     if (tensor->place().GetType() == phi::AllocationType::XPU) {
       DenseTensor temp;
-      paddle::framework::TensorCopySync(*tensor, phi::CPUPlace(), &temp);
+      phi::Copy(*dev_ctx, *tensor, phi::CPUPlace(), true, &temp);
       vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
       continue;
     }
 #endif
-    if (paddle::platform::is_gpu_place(tensor->place())) {
+    if (tensor->place().GetType() == phi::AllocationType::GPU) {
       DenseTensor temp;
-      paddle::framework::TensorCopySync(
-          *tensor, paddle::platform::CPUPlace(), &temp);
+      phi::Copy(*dev_ctx, *tensor, phi::CPUPlace(), true, &temp);
       vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
     } else {
       vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
@@ -120,22 +120,24 @@ inline std::vector<T> get_new_data_from_tensor(
   std::vector<T> vec_new_data;
   auto* new_data = new_data_tensor->data<T>();
   DenseTensor cpu_starts_tensor;
+  auto& pool = phi::DeviceContextPool::Instance();
+  phi::DeviceContext* dev_ctx = pool.Get(new_data_tensor->place());
   if (paddle::platform::is_gpu_place(new_data_tensor->place())) {
-    paddle::framework::TensorCopySync(
-        *new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor);
+    phi::Copy(
+        *dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
     new_data = cpu_starts_tensor.data<T>();
   }
 #ifdef PADDLE_WITH_ASCEND_CL
   if (paddle::platform::is_npu_place(new_data_tensor->place())) {
-    paddle::framework::TensorCopySync(
-        *new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor);
+    phi::Copy(
+        *dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
     new_data = cpu_starts_tensor.data<T>();
   }
 #endif
 #ifdef PADDLE_WITH_XPU
   if (paddle::platform::is_xpu_place(new_data_tensor->place())) {
-    paddle::framework::TensorCopySync(
-        *new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor);
+    phi::Copy(
+        *dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
     new_data = cpu_starts_tensor.data<T>();
   }
 #endif
diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h
index 6f1cac49352e3a97420a1ed341ccc1119b4674bd..7a4143c875c5fcce6df7c5e93039fac60624f844 100644
--- a/paddle/phi/kernels/funcs/math_function.h
+++ b/paddle/phi/kernels/funcs/math_function.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu
index 919663a75e6cc9efb7e57c377a19d0a9fcec4ff0..a17f698d431e24c419c7a9ad45177af0c3c9f69f 100644
--- a/paddle/phi/kernels/gpu/amp_kernel.cu
+++ b/paddle/phi/kernels/gpu/amp_kernel.cu
@@ -19,7 +19,6 @@
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/impl/amp_kernel_impl.h"
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/memory/memory.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
index a9cc8f591be7c3cb2e9122280fc99c684283f7fd..7acfd33e94a9a415d892440956a071479a9c4665 100644
--- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
@@ -16,7 +16,6 @@
 
 #include <vector>
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
@@ -86,8 +85,7 @@ void BroadcastTensorsGradKernel(const Context& ctx,
     ctx.template Alloc<T>(output_tensor);
     if (just_copy) {
       // Turns out to be a No-Op, simply copy tensors
-      paddle::framework::TensorCopy(
-          *input_tensor, ctx.GetPlace(), ctx, output_tensor);
+      phi::Copy(ctx, *input_tensor, ctx.GetPlace(), false, output_tensor);
     } else {
       // reduce_sum implementation on CUDA
       funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
diff --git a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu
index a98fdfaa8fc10a12eb1bc8bc2f6053361909d4fc..698ec44e6123bd357c8606486da750d98a1aca18 100644
--- a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu
@@ -29,7 +29,7 @@ namespace cub = hipcub;
 #endif
 
 #include <iterator>
 #include <random>
 
-#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/tensor_utils.h"
diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
index 072c38b1303070b1bbd56ac43a761ae7858867b6..dcbf003281f24a8897d94390a5b81bd369e1f3e5 100644
--- a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
+++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
@@ -27,8 +27,8 @@ namespace cub = hipcub;
 #endif
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/core/generator.h"
+#include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/distribution_helper.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -103,7 +103,7 @@ struct OneHotGenerator<GPUContext, T> {
     DenseTensor input_tensor;
     input_tensor.Resize(out->dims());
     ctx.template Alloc<T>(&input_tensor);
-    paddle::framework::TensorCopy(*out, ctx.GetPlace(), &input_tensor);
+    phi::Copy(ctx, *out, ctx.GetPlace(), false, &input_tensor);
     funcs::set_constant(ctx, out, 0.0);
     OneHotCUDAKernel<T, thread_size>
         <<<block_size, thread_size, 0, ctx.stream()>>>(
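
In the GPU kernels above, phi::Copy with blocking=false enqueues the copy on the context's own stream, so a subsequent kernel launch on the same stream (like OneHotCUDAKernel on ctx.stream()) is ordered after the copy without an explicit synchronization. A sketch of that pattern under those assumptions (kernel and helper names are illustrative):

    // Async copy and kernel launch share ctx.stream(); the stream's FIFO
    // ordering means the kernel reads the copied values, no ctx.Wait() needed.
    template <typename T>
    __global__ void ScaleInPlace(T* p, int64_t n) {
      int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) p[i] *= static_cast<T>(2);
    }

    template <typename T>
    void CopyThenScale(const phi::GPUContext& ctx,
                       const phi::DenseTensor& src,
                       phi::DenseTensor* dst) {
      dst->Resize(src.dims());
      ctx.template Alloc<T>(dst);
      phi::Copy(ctx, src, ctx.GetPlace(), /*blocking=*/false, dst);  // enqueued
      int64_t n = dst->numel();
      int threads = 256;
      int blocks = static_cast<int>((n + threads - 1) / threads);
      ScaleInPlace<T><<<blocks, threads, 0, ctx.stream()>>>(dst->data<T>(), n);
    }
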
diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu
index 2510ff8a5453a598ee1c24f4c71dead6beea5edf..9aa5d55201c0b97088078bfe0f3a7f2ae9f6ba08 100644
--- a/paddle/phi/kernels/gpu/interpolate_kernel.cu
+++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu
@@ -693,8 +693,7 @@ static void Interpolate1DCUDAFwd(
   }
   if (out_size) {
     DenseTensor sizes;
-    paddle::framework::TensorCopySync(
-        *out_size, paddle::platform::CPUPlace(), &sizes);
+    phi::Copy(dev_ctx, *out_size, phi::CPUPlace(), true, &sizes);
     auto size_data = sizes.data<int>();
     out_w = size_data[0];
   }
@@ -714,7 +713,7 @@
   auto output_data = dev_ctx.template Alloc<T>(output);
 
   if (in_w == out_w) {
-    paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output);
+    phi::Copy(dev_ctx, input, dev_ctx.GetPlace(), false, output);
     return;
   }
@@ -834,8 +833,8 @@
   }
   if (out_size) {
     DenseTensor sizes;
-    paddle::framework::TensorCopySync(
-        *out_size, paddle::platform::CPUPlace(), &sizes);
+    phi::Copy(dev_ctx, *out_size, phi::CPUPlace(), true, &sizes);
+
     auto size_data = sizes.data<int>();
     out_h = size_data[0];
     out_w = size_data[1];
@@ -862,7 +861,7 @@
   auto output_data = dev_ctx.template Alloc<T>(output);
 
   if (in_h == out_h && in_w == out_w) {
-    paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output);
+    phi::Copy(dev_ctx, input, dev_ctx.GetPlace(), false, output);
     return;
   }
@@ -1110,8 +1109,7 @@
   }
   if (out_size) {
     DenseTensor sizes;
-    paddle::framework::TensorCopySync(
-        *out_size, paddle::platform::CPUPlace(), &sizes);
+    phi::Copy(dev_ctx, *out_size, phi::CPUPlace(), true, &sizes);
     auto size_data = sizes.data<int>();
     out_d = size_data[0];
     out_h = size_data[1];
@@ -1144,7 +1142,7 @@
   auto output_data = dev_ctx.template Alloc<T>(output);
 
   if (in_d == out_d && in_h == out_h && in_w == out_w) {
-    paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output);
+    phi::Copy(dev_ctx, input, dev_ctx.GetPlace(), false, output);
     return;
   }
diff --git a/paddle/phi/kernels/impl/meshgrid_kernel_impl.h b/paddle/phi/kernels/impl/meshgrid_kernel_impl.h
index e66632498f67029a8abda3ea499573f32272fc19..dfe162a270a9b59a8b12ec789c0ee63c447ac071 100644
--- a/paddle/phi/kernels/impl/meshgrid_kernel_impl.h
+++ b/paddle/phi/kernels/impl/meshgrid_kernel_impl.h
@@ -14,7 +14,6 @@
 
 #pragma once
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
@@ -58,8 +57,7 @@ void MeshgridForward(const Context& ctx,
       view_shape[i] = shape[i];
 
       DenseTensor reshape_ins_tensor;
-      paddle::framework::TensorCopy(
-          *ins[i], ctx.GetPlace(), ctx, &reshape_ins_tensor);
+      phi::Copy(ctx, *ins[i], ctx.GetPlace(), false, &reshape_ins_tensor);
       DDim out_dims_reshape = phi::make_ddim(view_shape);
       reshape_ins_tensor.Resize(out_dims_reshape);
       DDim out_dims = phi::make_ddim(shape);
diff --git a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu
index f1e8497004520f74f045cf1aa4ea4590f33656b2..c405061adbf5a829ebb450ddf01bd5450c614e06 100644
--- a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu
+++ b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu
@@ -18,7 +18,6 @@
 
 #include <math.h>
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/float16.h"