Unverified · Commit 057cdb95 authored by engineer1109, committed by GitHub

decouple tensor_utils (#50264)

fix X

remove TensorCopy

codestyle

add fluid memory header

fix symbol

fix cmake

fix cmake

fix context

fix header

fix place

fix context

fix context

fix context

fix code

fix custom context

fix custom context

fix copy

fix data_transform

fix style

remove changes of custom

fix scalar
Parent: fcb746cb
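The recurring pattern in this commit: call sites that used paddle::framework::TensorCopySync (or TensorCopy) from fluid now call phi::Copy, fetching a phi::DeviceContext from phi::DeviceContextPool. Below is a minimal sketch of that pattern, built only from calls that appear in the hunks of this commit; the helper name CopyToCpuSync is illustrative and not part of the change itself.

// Sketch of the TensorCopySync -> phi::Copy migration applied throughout this commit.
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_utils.h"

// Hypothetical helper: synchronously copy any tensor to host memory.
phi::DenseTensor CopyToCpuSync(const phi::DenseTensor& src) {
  phi::DenseTensor dst;
  // Old: paddle::framework::TensorCopySync(src, phi::CPUPlace(), &dst);
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  phi::DeviceContext* dev_ctx = pool.Get(src.place());  // context that owns the source device's stream
  phi::Copy(*dev_ctx, src, phi::CPUPlace(), /*blocking=*/true, &dst);  // blocking => synchronous copy
  return dst;
}

The same replacement shows up in data_transform.cc, int_array.cc, scalar.cc, and the kernels touched below, which is why those targets now depend on phi_tensor_utils instead of the fluid tensor library.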
......@@ -22,8 +22,6 @@ limitations under the License. */
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/transfer_layout_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace experimental {
......@@ -169,8 +167,8 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor,
VLOG(3) << "DeviceTransform in, src_place " << tensor.place()
<< " dst_place: " << dst_place;
auto& pool = phi::DeviceContextPool::Instance();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& pool = paddle::platform::DeviceContextPool::Instance();
// NOTE(yy): TransDataPlace should wait for computation of input.
if (!platform::is_cuda_pinned_place(tensor.place())) {
pool.Get(tensor.place())->Wait();
......@@ -188,7 +186,13 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor,
// But the embarrassment is that this solution makes training
// slower.
phi::DenseTensor out;
paddle::framework::TensorCopySync(tensor, dst_place, &out);
phi::DeviceContext* dev_ctx;
if (dst_place.GetType() != AllocationType::CPU) {
dev_ctx = pool.Get(dst_place);
} else {
dev_ctx = pool.Get(tensor.place());
}
phi::Copy(*dev_ctx, tensor, dst_place, true, &out);
return out;
}
......
......@@ -63,7 +63,7 @@ if(WITH_CUSTOM_DEVICE)
cc_test(
custom_device_test
SRCS custom/custom_device_test.cc
DEPS phi_backends phi_device_context gradient_accumulator)
DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator)
cc_test(
capi_test
SRCS custom/capi_test.cc
......
......@@ -15,8 +15,8 @@ endif()
cc_library(
scalar
SRCS scalar.cc
DEPS phi_enforce tensor)
DEPS phi_enforce phi_tensor_utils)
cc_library(
int_array
SRCS int_array.cc
DEPS phi_enforce tensor)
DEPS phi_enforce phi_tensor_utils)
......@@ -14,8 +14,10 @@ limitations under the License. */
#include "paddle/phi/common/int_array.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace experimental {
......@@ -28,7 +30,9 @@ IntArrayBase<phi::DenseTensor>::IntArrayBase(
AssignDataFromTensor(tensor);
} else {
phi::DenseTensor tensor_tmp;
paddle::framework::TensorCopySync(tensor, CPUPlace(), &tensor_tmp);
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(tensor.place());
phi::Copy(*dev_ctx, tensor, CPUPlace(), true, &tensor_tmp);
AssignDataFromTensor(tensor_tmp);
}
}
......@@ -45,8 +49,9 @@ IntArrayBase<phi::DenseTensor>::IntArrayBase(
array_.push_back(*tensor_list[i].template data<int32_t>());
} else {
phi::DenseTensor tensor_tmp;
paddle::framework::TensorCopySync(
tensor_list[i], CPUPlace(), &tensor_tmp);
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(tensor_list[i].place());
phi::Copy(*dev_ctx, tensor_list[i], CPUPlace(), true, &tensor_tmp);
array_.push_back(*tensor_tmp.template data<int32_t>());
}
break;
......@@ -55,8 +60,9 @@ IntArrayBase<phi::DenseTensor>::IntArrayBase(
array_.push_back(*tensor_list[i].template data<int64_t>());
} else {
phi::DenseTensor tensor_tmp;
paddle::framework::TensorCopySync(
tensor_list[i], CPUPlace(), &tensor_tmp);
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(tensor_list[i].place());
phi::Copy(*dev_ctx, tensor_list[i], CPUPlace(), true, &tensor_tmp);
array_.push_back(*tensor_tmp.template data<int64_t>());
}
break;
......
......@@ -14,9 +14,11 @@ limitations under the License. */
#include "paddle/phi/common/scalar.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace experimental {
......@@ -31,9 +33,11 @@ ScalarBase<phi::DenseTensor>::ScalarBase(const phi::DenseTensor& tensor_in)
"now Tensor has `%d` elements",
tensor_in.numel()));
auto cpu_place = phi::CPUPlace();
if (!paddle::platform::is_same_place(tensor_in.place(), cpu_place)) {
if (tensor_in.place().GetType() != phi::AllocationType::CPU) {
phi::DenseTensor tensor;
framework::TensorCopySync(tensor_in, cpu_place, &tensor);
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(tensor_in.place());
phi::Copy(*dev_ctx, tensor_in, cpu_place, true, &tensor);
GetDataFromTensor(tensor);
} else {
GetDataFromTensor(tensor_in);
......
......@@ -36,7 +36,7 @@ void Copy(const Context& dev_ctx,
const auto& src_place = src.place();
if (&src == dst) {
if (paddle::platform::is_same_place(src_place, dst_place)) {
if (src_place.GetType() == dst_place.GetType()) {
VLOG(6) << "Skip copy the same data(" << src_ptr << ") from " << src_place
<< " to " << dst_place;
} else {
......@@ -54,24 +54,24 @@ void Copy(const Context& dev_ctx,
dst->Resize(src.dims());
void* dst_ptr = nullptr;
if (paddle::platform::is_cpu_place(dst_place)) {
if (dst_place.GetType() == AllocationType::CPU) {
dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
#ifdef PADDLE_WITH_MKLDNN
dst->set_layout(src.layout());
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if (paddle::platform::is_gpu_place(dst_place) ||
paddle::platform::is_cuda_pinned_place(dst_place)) {
} else if (dst_place.GetType() == AllocationType::GPU ||
dst_place.GetType() == AllocationType::GPUPINNED) {
dst_ptr = dev_ctx.Alloc(
dst, src.dtype(), 0, paddle::platform::is_cuda_pinned_place(dst_place));
dst, src.dtype(), 0, dst_place.GetType() == AllocationType::GPUPINNED);
#endif
#ifdef PADDLE_WITH_XPU
} else if (paddle::platform::is_xpu_place(dst_place)) {
} else if (dst_place.GetType() == AllocationType::XPU) {
dst_ptr = dev_ctx.Alloc(dst, src.dtype());
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (paddle::platform::is_custom_place(dst_place)) {
} else if (dst_place.GetType() == AllocationType::CUSTOM) {
dst_ptr = dev_ctx.Alloc(dst, src.dtype());
#endif
}
......@@ -98,22 +98,22 @@ void Copy(const Context& dev_ctx,
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
CHECK(dst->layout() == src.layout());
if (paddle::platform::is_cpu_place(src_place) &&
paddle::platform::is_cpu_place(dst_place)) {
if (src_place.GetType() == AllocationType::CPU &&
dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if ((paddle::platform::is_cpu_place(src_place) ||
paddle::platform::is_cuda_pinned_place(src_place)) && // NOLINT
(paddle::platform::is_cpu_place(dst_place) ||
paddle::platform::is_cuda_pinned_place(dst_place))) {
} else if ((src_place.GetType() == AllocationType::CPU ||
src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT
(dst_place.GetType() == AllocationType::CPU ||
dst_place.GetType() == AllocationType::GPUPINNED)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
paddle::platform::is_cpu_place(dst_place)) {
} else if (src_place.GetType() == AllocationType::GPU && // NOLINT
dst_place.GetType() == AllocationType::CPU) {
auto src_gpu_place = src_place;
auto dst_cpu_place = dst_place;
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(ctx_place),
ctx_place.GetType() == AllocationType::GPU,
true,
errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
......@@ -131,14 +131,14 @@ void Copy(const Context& dev_ctx,
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if ((paddle::platform::is_cpu_place(src_place) ||
paddle::platform::is_cuda_pinned_place(src_place)) && // NOLINT
paddle::platform::is_gpu_place(dst_place)) {
} else if ((src_place.GetType() == AllocationType::CPU ||
src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT
dst_place.GetType() == AllocationType::GPU) {
auto src_cpu_place = src_place;
auto dst_gpu_place = dst_place;
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(ctx_place),
ctx_place.GetType() == AllocationType::GPU,
true,
errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
......@@ -156,13 +156,13 @@ void Copy(const Context& dev_ctx,
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
paddle::platform::is_gpu_place(dst_place)) {
} else if (src_place.GetType() == AllocationType::GPU && // NOLINT
dst_place.GetType() == AllocationType::GPU) {
auto src_gpu_place = src_place;
auto dst_gpu_place = dst_place;
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(ctx_place),
ctx_place.GetType() == AllocationType::GPU,
true,
errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
......@@ -170,20 +170,16 @@ void Copy(const Context& dev_ctx,
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
if (paddle::platform::is_same_place(src_place, dst_place)) {
if (src_place.GetType() == dst_place.GetType()) {
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else {
if (paddle::platform::is_same_place(ctx_place, src_place)) {
if (ctx_place.GetType() == src_place.GetType()) {
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
paddle::platform::DeviceContextPool::Instance()
.Get(src.place())
->Wait();
} else if (paddle::platform::is_same_place(ctx_place, dst_place)) {
paddle::platform::DeviceContextPool::Instance()
.Get(src.place())
->Wait();
phi::DeviceContextPool::Instance().Get(src.place())->Wait();
} else if (ctx_place.GetType() == dst_place.GetType()) {
phi::DeviceContextPool::Instance().Get(src.place())->Wait();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else {
......@@ -191,13 +187,13 @@ void Copy(const Context& dev_ctx,
"Context place dose not match the source and destination place."));
}
}
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
paddle::platform::is_cuda_pinned_place(dst_place)) {
} else if (src_place.GetType() == AllocationType::GPU && // NOLINT
dst_place.GetType() == AllocationType::GPUPINNED) {
auto src_gpu_place = src_place;
auto dst_cuda_pinned_place = dst_place;
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(ctx_place),
ctx_place.GetType() == AllocationType::GPU,
true,
errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
......@@ -217,14 +213,14 @@ void Copy(const Context& dev_ctx,
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
#endif
#ifdef PADDLE_WITH_XPU
} else if (paddle::platform::is_xpu_place(src_place) && // NOLINT
paddle::platform::is_cpu_place(dst_place)) {
} else if (src_place.GetType() == AllocationType::XPU && // NOLINT
dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_cpu_place(src_place) &&
paddle::platform::is_xpu_place(dst_place)) {
} else if (src_place.GetType() == AllocationType::CPU &&
dst_place.GetType() == AllocationType::XPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_xpu_place(src_place) &&
paddle::platform::is_xpu_place(dst_place)) {
} else if (src_place.GetType() == AllocationType::XPU &&
dst_place.GetType() == AllocationType::XPU) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
......@@ -233,32 +229,26 @@ void Copy(const Context& dev_ctx,
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (paddle::platform::is_custom_place(src_place) && // NOLINT
paddle::platform::is_cpu_place(dst_place)) {
} else if (src_place.GetType() == AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == AllocationType::CPU) {
auto stream =
blocking
? nullptr
: reinterpret_cast<const paddle::platform::CustomDeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (paddle::platform::is_cpu_place(src_place) && // NOLINT
paddle::platform::is_custom_place(dst_place)) {
} else if (src_place.GetType() == AllocationType::CPU && // NOLINT
dst_place.GetType() == AllocationType::CUSTOM) {
auto stream =
blocking
? nullptr
: reinterpret_cast<const paddle::platform::CustomDeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (paddle::platform::is_custom_place(src_place) && // NOLINT
paddle::platform::is_custom_place(dst_place)) {
} else if (src_place.GetType() == AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == AllocationType::CUSTOM) {
auto stream =
blocking
? nullptr
: reinterpret_cast<const paddle::platform::CustomDeviceContext&>(
dev_ctx)
.stream();
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
#endif
} else {
......@@ -435,11 +425,11 @@ void TensorFromVector(const std::vector<T>& src,
auto dst_ptr = static_cast<void*>(dst->data<T>());
auto size = src.size() * sizeof(T);
if (paddle::platform::is_cpu_place(dst_place)) {
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (paddle::platform::is_gpu_place(dst_place)) { // NOLINT
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
......@@ -450,7 +440,7 @@ void TensorFromVector(const std::vector<T>& src,
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (paddle::platform::is_custom_place(dst_place)) { // NOLINT
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
......@@ -461,7 +451,7 @@ void TensorFromVector(const std::vector<T>& src,
}
#endif
#ifdef PADDLE_WITH_XPU
else if (paddle::platform::is_xpu_place(dst_place)) { // NOLINT
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
......@@ -490,11 +480,11 @@ void TensorFromVector(const std::vector<bool>& src,
auto dst_ptr = ctx.template Alloc<bool>(dst);
auto size = src.size() * sizeof(bool);
if (paddle::platform::is_cpu_place(dst_place)) {
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (paddle::platform::is_gpu_place(dst_place)) { // NOLINT
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
......@@ -505,13 +495,13 @@ void TensorFromVector(const std::vector<bool>& src,
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (paddle::platform::is_custom_place(dst_place)) { // NOLINT
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
auto stream = reinterpret_cast<const phi::CustomContext&>(ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (paddle::platform::is_xpu_place(dst_place)) { // NOLINT
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
......@@ -583,11 +573,11 @@ void TensorFromArray(const T* src,
auto dst_ptr = static_cast<void*>(dst->data<T>());
auto size = array_size * sizeof(T);
if (paddle::platform::is_cpu_place(dst_place)) {
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (paddle::platform::is_gpu_place(dst_place)) { // NOLINT
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
......@@ -598,7 +588,7 @@ void TensorFromArray(const T* src,
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (paddle::platform::is_custom_place(dst_place)) { // NOLINT
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
......@@ -609,7 +599,7 @@ void TensorFromArray(const T* src,
}
#endif
#ifdef PADDLE_WITH_XPU
else if (paddle::platform::is_xpu_place(dst_place)) { // NOLINT
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
......@@ -684,11 +674,11 @@ void TensorToVector(const phi::DenseTensor& src,
dst->resize(src.numel());
auto dst_ptr = static_cast<void*>(dst->data());
if (paddle::platform::is_cpu_place(src.place())) {
if (src.place().GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (paddle::platform::is_gpu_place(src.place())) { // NOLINT
else if (src.place().GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
......@@ -699,12 +689,12 @@ void TensorToVector(const phi::DenseTensor& src,
}
#endif
#if defined(PADDLE_WITH_XPU)
else if (paddle::platform::is_xpu_place(src.place())) { // NOLINT
else if (src.place().GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (paddle::platform::is_custom_place(src.place())) { // NOLINT
else if (src.place().GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
......@@ -728,11 +718,11 @@ void TensorToVector(const phi::DenseTensor& src,
dst->resize(src.numel());
auto dst_ptr = static_cast<void*>(array);
if (paddle::platform::is_cpu_place(src.place())) {
if (src.place().GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (paddle::platform::is_gpu_place(src.place())) { // NOLINT
else if (src.place().GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
......@@ -743,12 +733,12 @@ void TensorToVector(const phi::DenseTensor& src,
}
#endif
#if defined(PADDLE_WITH_XPU)
else if (paddle::platform::is_xpu_place(src.place())) { // NOLINT
else if (src.place().GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (paddle::platform::is_custom_place(src.place())) { // NOLINT
else if (src.place().GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
......@@ -805,7 +795,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<T>* dst) {
auto dst_ptr = static_cast<void*>(dst->data());
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(src.place()),
src.place().GetType() == AllocationType::CPU,
true,
phi::errors::InvalidArgument(
"The input tensor should be CPU device, but actually it is in %s.",
......@@ -821,12 +811,12 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<bool>* dst) {
bool* array = new bool[src.numel()];
paddle::platform::CPUPlace dst_place{};
phi::CPUPlace dst_place{};
dst->resize(src.numel());
auto dst_ptr = static_cast<void*>(array);
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(src.place()),
src.place().GetType() == AllocationType::CPU,
true,
phi::errors::InvalidArgument(
"The input tensor should be CPU device, but actually it is in %s.",
......@@ -891,7 +881,7 @@ phi::DenseTensor ReshapeToMatrix(const phi::DenseTensor& src,
template <typename T>
T GetValue(const phi::DenseTensor* x) {
T value = static_cast<T>(0);
if (!paddle::platform::is_cpu_place(x->place())) {
if (x->place().GetType() != AllocationType::CPU) {
phi::DenseTensor cpu_x{};
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
phi::DeviceContext* dev_ctx = pool.Get(x->place());
......
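In the Copy kernel above, the blocking flag decides whether a transfer uses the device context's stream: a blocking copy resolves the stream to nullptr and returns only after the data has arrived, while a non-blocking copy is enqueued on the GPUContext stream and must be synchronized before the result is read. A hedged sketch of the two call styles under a GPU build; gpu_tensor and the local names are illustrative, not from the commit.

// Assumed usage of phi::Copy's blocking flag (GPU build); gpu_tensor is an
// illustrative phi::DenseTensor already resident on a GPU place.
phi::DeviceContext* dev_ctx =
    phi::DeviceContextPool::Instance().Get(gpu_tensor.place());
phi::DenseTensor host_copy;
phi::DenseTensor device_copy;
// blocking = true: the stream argument becomes nullptr, so the call returns
// only after the bytes have landed on the host.
phi::Copy(*dev_ctx, gpu_tensor, phi::CPUPlace(), /*blocking=*/true, &host_copy);
// blocking = false: the transfer is enqueued on dev_ctx's stream; the host must
// wait on that stream before touching device_copy's data.
phi::Copy(*dev_ctx, gpu_tensor, gpu_tensor.place(), /*blocking=*/false, &device_copy);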
......@@ -24,8 +24,6 @@
#include "paddle/phi/kernels/isfinite_kernel.h"
#include "paddle/phi/kernels/reduce_all_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace phi {
// Utils
......
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
......@@ -163,7 +162,7 @@ void BatchNormGradRawKernel(const Context& ctx,
}
if (d_x && (N * sample_size) == 1 && !use_global_stats) {
paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
phi::Copy(ctx, *d_y, ctx.GetPlace(), false, d_x);
return;
}
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -106,7 +105,7 @@ void BatchNormKernel(const Context& ctx,
if ((N * sample_size) == 1) {
// Only 1 element in normalization dimension,
// we skip the batch norm calculation, let y = x.
paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
phi::Copy(ctx, x, ctx.GetPlace(), false, y);
return;
}
......
......@@ -16,7 +16,6 @@
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/cross_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
......
......@@ -572,7 +572,7 @@ static void Interpolate1DCPUFwd(
dev_ctx.template Alloc<T>(output);
if (in_w == out_w) {
paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, output);
return;
}
......@@ -702,7 +702,7 @@ static void Interpolate2DCPUFwd(
dev_ctx.template Alloc<T>(output);
if (in_h == out_h && in_w == out_w) {
paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, output);
return;
}
......@@ -897,7 +897,7 @@ static void Interpolate3DCPUFwd(
dev_ctx.template Alloc<T>(output);
if (in_d == out_d && in_h == out_h && in_w == out_w) {
paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, output);
return;
}
......
......@@ -23,7 +23,6 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_header.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#endif
......
......@@ -14,7 +14,6 @@
#pragma once
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -83,8 +82,10 @@ inline std::vector<int> get_new_shape(
const std::vector<const DenseTensor*>& list_new_shape_tensor) {
// get tensor from
std::vector<int> vec_new_shape;
auto& pool = phi::DeviceContextPool::Instance();
for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
auto tensor = list_new_shape_tensor[i];
phi::DeviceContext* dev_ctx = pool.Get(tensor->place());
PADDLE_ENFORCE_EQ(tensor->dims() == phi::make_ddim({1}) ||
tensor->dims() == phi::make_ddim({}),
true,
......@@ -96,15 +97,14 @@ inline std::vector<int> get_new_shape(
#ifdef PADDLE_WITH_XPU
if (tensor->place().GetType() == phi::AllocationType::XPU) {
DenseTensor temp;
paddle::framework::TensorCopySync(*tensor, phi::CPUPlace(), &temp);
phi::Copy(*dev_ctx, *tensor, phi::CPUPlace(), true, &temp);
vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
continue;
}
#endif
if (paddle::platform::is_gpu_place(tensor->place())) {
if (tensor->place().GetType() == phi::AllocationType::GPU) {
DenseTensor temp;
paddle::framework::TensorCopySync(
*tensor, paddle::platform::CPUPlace(), &temp);
phi::Copy(*dev_ctx, *tensor, phi::CPUPlace(), true, &temp);
vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
} else {
vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
......@@ -120,22 +120,24 @@ inline std::vector<T> get_new_data_from_tensor(
std::vector<T> vec_new_data;
auto* new_data = new_data_tensor->data<T>();
DenseTensor cpu_starts_tensor;
auto& pool = phi::DeviceContextPool::Instance();
phi::DeviceContext* dev_ctx = pool.Get(new_data_tensor->place());
if (paddle::platform::is_gpu_place(new_data_tensor->place())) {
paddle::framework::TensorCopySync(
*new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor);
phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>();
}
#ifdef PADDLE_WITH_ASCEND_CL
if (paddle::platform::is_npu_place(new_data_tensor->place())) {
paddle::framework::TensorCopySync(
*new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor);
phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>();
}
#endif
#ifdef PADDLE_WITH_XPU
if (paddle::platform::is_xpu_place(new_data_tensor->place())) {
paddle::framework::TensorCopySync(
*new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor);
phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>();
}
#endif
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
......
......@@ -19,7 +19,6 @@
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/impl/amp_kernel_impl.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
......
......@@ -16,7 +16,6 @@
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
......@@ -86,8 +85,7 @@ void BroadcastTensorsGradKernel(const Context& ctx,
ctx.template Alloc<T>(output_tensor);
if (just_copy) {
// Turns out to be a No-Op, simply copy tensors
paddle::framework::TensorCopy(
*input_tensor, ctx.GetPlace(), ctx, output_tensor);
phi::Copy(ctx, *input_tensor, ctx.GetPlace(), false, output_tensor);
} else {
// reduce_sum implementation on CUDA
funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
......
......@@ -29,7 +29,7 @@ namespace cub = hipcub;
#include <iterator>
#include <random>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/tensor_utils.h"
......
......@@ -27,8 +27,8 @@
namespace cub = hipcub;
#endif
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -103,7 +103,7 @@ struct OneHotGenerator<GPUContext, T> {
DenseTensor input_tensor;
input_tensor.Resize(out->dims());
ctx.template Alloc<T>(&input_tensor);
paddle::framework::TensorCopy(*out, ctx.GetPlace(), &input_tensor);
phi::Copy(ctx, *out, ctx.GetPlace(), false, &input_tensor);
funcs::set_constant(ctx, out, 0.0);
OneHotCUDAKernel<T, thread_size>
<<<block_size, thread_size, 0, ctx.stream()>>>(
......
......@@ -693,8 +693,7 @@ static void Interpolate1DCUDAFwd(
}
if (out_size) {
DenseTensor sizes;
paddle::framework::TensorCopySync(
*out_size, paddle::platform::CPUPlace(), &sizes);
phi::Copy(dev_ctx, *out_size, phi::CPUPlace(), true, &sizes);
auto size_data = sizes.data<int>();
out_w = size_data[0];
}
......@@ -714,7 +713,7 @@ static void Interpolate1DCUDAFwd(
auto output_data = dev_ctx.template Alloc<T>(output);
if (in_w == out_w) {
paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output);
phi::Copy(dev_ctx, input, dev_ctx.GetPlace(), false, output);
return;
}
......@@ -834,8 +833,8 @@ static void Interpolate2DCUDAFwd(
}
if (out_size) {
DenseTensor sizes;
paddle::framework::TensorCopySync(
*out_size, paddle::platform::CPUPlace(), &sizes);
phi::Copy(dev_ctx, *out_size, phi::CPUPlace(), true, &sizes);
auto size_data = sizes.data<int>();
out_h = size_data[0];
out_w = size_data[1];
......@@ -862,7 +861,7 @@ static void Interpolate2DCUDAFwd(
auto output_data = dev_ctx.template Alloc<T>(output);
if (in_h == out_h && in_w == out_w) {
paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output);
phi::Copy(dev_ctx, input, dev_ctx.GetPlace(), false, output);
return;
}
......@@ -1110,8 +1109,7 @@ static void Interpolate3DCUDAFwd(
}
if (out_size) {
DenseTensor sizes;
paddle::framework::TensorCopySync(
*out_size, paddle::platform::CPUPlace(), &sizes);
phi::Copy(dev_ctx, *out_size, phi::CPUPlace(), true, &sizes);
auto size_data = sizes.data<int>();
out_d = size_data[0];
out_h = size_data[1];
......@@ -1144,7 +1142,7 @@ static void Interpolate3DCUDAFwd(
auto output_data = dev_ctx.template Alloc<T>(output);
if (in_d == out_d && in_h == out_h && in_w == out_w) {
paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output);
phi::Copy(dev_ctx, input, dev_ctx.GetPlace(), false, output);
return;
}
......
......@@ -14,7 +14,6 @@
#pragma once
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -58,8 +57,7 @@ void MeshgridForward(const Context& ctx,
view_shape[i] = shape[i];
DenseTensor reshape_ins_tensor;
paddle::framework::TensorCopy(
*ins[i], ctx.GetPlace(), ctx, &reshape_ins_tensor);
phi::Copy(ctx, *ins[i], ctx.GetPlace(), false, &reshape_ins_tensor);
DDim out_dims_reshape = phi::make_ddim(view_shape);
reshape_ins_tensor.Resize(out_dims_reshape);
DDim out_dims = phi::make_ddim(shape);
......
......@@ -18,7 +18,6 @@
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
......