Unverified commit 2739bd73, authored by YuanRisheng, committed by GitHub

[Phi]Change Copy from Kernel to basic component utils (#43622)

* perfect copy

* deal with conflict

* deal with conflict

* fix compile bugs

* fix unittest bugs

* change code format

* deal with conflict

* modify code by review

* fix ce bugs

* fix ce bugs

* add lo

* perfect code format

* deal with conflicts
Parent 7985407b
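In short, this change moves the phi::Copy entry point out of the kernel layer (paddle/phi/kernels/copy_kernel.h) and into the basic component utils (paddle/phi/core/tensor_utils.h), so callers no longer dispatch through the kernel factory. A minimal sketch of a migrated call site, assuming a CPU context (the wrapper function and its names are illustrative, not part of this diff):

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/tensor_utils.h"  // was: paddle/phi/kernels/copy_kernel.h

void CopyToCpu(const phi::CPUContext& dev_ctx,
               const phi::DenseTensor& src,
               phi::DenseTensor* dst) {
  // phi::Copy is now a plain utility declared in tensor_utils.h;
  // no kernel lookup through phi::KernelFactory is required.
  phi::Copy(dev_ctx, src, phi::CPUPlace(), /*blocking=*/false, dst);
}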
......@@ -21,13 +21,6 @@
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT);
#endif
namespace eager_test {
using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta;
class AutogradMetaTest : public AbstractAutogradMeta {
......@@ -212,7 +205,8 @@ TEST(EagerVariable, Constructor) {
TEST(EagerVariable, DataLayout) {
paddle::experimental::Tensor tensor;
phi::DenseTensorMeta meta =
phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1, 1, 1}),
phi::DenseTensorMeta(phi::DataType::FLOAT32,
phi::make_ddim({1, 1, 1, 1}),
paddle::experimental::DataLayout::UNDEFINED);
std::shared_ptr<phi::DenseTensor> dt = std::make_shared<phi::DenseTensor>(
std::make_unique<paddle::experimental::DefaultAllocator>(
......
......@@ -30,7 +30,6 @@
#include "paddle/phi/core/tensor_meta.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
namespace egr {
......@@ -44,9 +43,12 @@ TEST(Backward, SingleNodeEmptyGrad) {
// Create Target Tensor
paddle::experimental::Tensor target_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
paddle::experimental::Tensor leaf_tensor;
{
......@@ -92,17 +94,24 @@ TEST(Backward, SingleNodeCustomGrad) {
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
std::vector<paddle::experimental::Tensor> grad_tensors;
// Create Grad Tensor
paddle::experimental::Tensor grad_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
10.0 /*value*/,
false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor));
paddle::experimental::Tensor leaf_tensor;
......@@ -157,9 +166,13 @@ TEST(Backward, LinearNodes) {
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
paddle::experimental::Tensor leaf_tensor;
......@@ -229,25 +242,39 @@ TEST(Backward, WithAccumulation) {
// Create Target Tensor
std::vector<paddle::experimental::Tensor> target_tensors;
paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor0 =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
paddle::experimental::Tensor tensor1 =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor0));
target_tensors.emplace_back(std::move(tensor1));
// Create Grad Tensor
std::vector<paddle::experimental::Tensor> grad_tensors;
paddle::experimental::Tensor grad_tensor0 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
5.0 /*value*/,
false /*is_leaf*/);
paddle::experimental::Tensor grad_tensor1 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
10.0 /*value*/,
false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor0));
grad_tensors.emplace_back(std::move(grad_tensor1));
......
......@@ -29,7 +29,6 @@
#include "paddle/phi/core/tensor_meta.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
namespace egr {
......@@ -43,15 +42,21 @@ TEST(Grad, SingleNodeEmptyGrad) {
// Create Target Tensor (output)
paddle::experimental::Tensor output_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
// Create input tensor
const paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
true /*is_leaf*/);
{
// Create Scale Node
......@@ -103,23 +108,33 @@ TEST(Grad, SingleNodeCustomGrad) {
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
std::vector<paddle::experimental::Tensor> grad_tensors;
// Create Grad Tensor
paddle::experimental::Tensor grad_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
10.0 /*value*/,
false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor));
paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
true /*is_leaf*/);
{
// Create Scale Node
......@@ -172,15 +187,22 @@ TEST(Grad, LinearNodes) {
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
true /*is_leaf*/);
{
// Create Node0
auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
......@@ -247,25 +269,39 @@ TEST(Grad, WithAccumulation) {
// Create Target Tensor
std::vector<paddle::experimental::Tensor> target_tensors;
paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor0 =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
paddle::experimental::Tensor tensor1 =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor0));
target_tensors.emplace_back(std::move(tensor1));
// Create Grad Tensor
std::vector<paddle::experimental::Tensor> grad_tensors;
paddle::experimental::Tensor grad_tensor0 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
5.0 /*value*/,
false /*is_leaf*/);
paddle::experimental::Tensor grad_tensor1 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
10.0 /*value*/,
false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor0));
grad_tensors.emplace_back(std::move(grad_tensor1));
......
......@@ -21,7 +21,7 @@
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/jit/layer.h"
......
......@@ -20,9 +20,9 @@ limitations under the License. */
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/copy_kernel.h"
namespace paddle {
namespace operators {
......@@ -41,7 +41,9 @@ struct GreaterThan {
// Value can be decided at compile time.
template <typename FUN, int INT_32 = 32>
constexpr bool CheckProperTileSize(int tile_long, int tile_short, int size_T,
constexpr bool CheckProperTileSize(int tile_long,
int tile_short,
int size_T,
FUN op) {
return (size_T == 16 && ((tile_long == INT_32 && op(tile_short, 4)) ||
(tile_long == 2 * INT_32 && op(tile_short, 4)) ||
......@@ -79,7 +81,8 @@ constexpr bool CheckNonLongTileSize(int tile_long, int tile_short, int size_T) {
// Use SM to do data transfer, load a tile into SM then store out.
// All tile reads and writes are coalesced, which speeds up the memory copy.
template <typename T, int NumThreads, int TileX, int TileY>
__global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims,
__global__ void TilingSwapDim1And2(const T* __restrict__ input,
Dim3 input_dims,
T* __restrict__ output) {
assert(blockDim.x == NumThreads);
assert(blockDim.y == 1);
......@@ -218,12 +221,14 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims,
template <int TSIZE>
bool SelectProperTileSize(std::vector<std::pair<int, int>>* tiles) {
PADDLE_ENFORCE_LE(
TSIZE, 16,
TSIZE,
16,
platform::errors::InvalidArgument(
"The tile size should smaller than 16, but received is:%d.", TSIZE));
PADDLE_ENFORCE_EQ(
(TSIZE & (TSIZE - 1)), 0,
(TSIZE & (TSIZE - 1)),
0,
platform::errors::InvalidArgument(
"Data types should be powers of 2, but reived size is:%d.", TSIZE));
......@@ -269,29 +274,37 @@ struct SystemElemType<16> {
};
template <typename T, int tile_long, int tile_short>
void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims,
void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d,
int tile_size_i,
int tile_size_j,
int total_tiles_count,
const T* input,
const Dim3& input_dims,
T* output) {
constexpr int NumThreads = tile_long;
if (tile_size_i <= tile_long && tile_size_j <= tile_short) {
TilingSwapDim1And2<T, NumThreads, tile_long, tile_short>
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
output);
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(
input, input_dims, output);
} else {
TilingSwapDim1And2<T, NumThreads, tile_short, tile_long>
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
output);
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(
input, input_dims, output);
}
}
template <typename T, int tile_long, int tile_short, typename dummy = void>
struct NarrowDims2TransposeDispatch {
static void DoTranspose(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims, T* output) {
static void DoTranspose(const phi::GPUContext& d,
int tile_size_i,
int tile_size_j,
int total_tiles_count,
const T* input,
const Dim3& input_dims,
T* output) {
PADDLE_ENFORCE_EQ(
(tile_long & (tile_long - 1)), 0,
(tile_long & (tile_long - 1)),
0,
platform::errors::InvalidArgument(
"The length of the longer side of the tile should be power of 2."
" But received value is:%d.",
......@@ -302,7 +315,12 @@ struct NarrowDims2TransposeDispatch {
if (request_satisfied) {
LaunchNarrowDims2TransposeKernel<T, tile_long, tile_short>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
return;
}
......@@ -312,11 +330,21 @@ struct NarrowDims2TransposeDispatch {
if (long_side_request_not_satisfied) {
NarrowDims2TransposeDispatch<T, tile_long * 2, tile_short>::DoTranspose(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
} else {
NarrowDims2TransposeDispatch<T, tile_long, tile_short + 1>::DoTranspose(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
}
}
......@@ -325,14 +353,22 @@ struct NarrowDims2TransposeDispatch {
// If Not long tile size, goto this function when compile.
template <typename T, int tile_long, int tile_short>
struct NarrowDims2TransposeDispatch<
T, tile_long, tile_short,
typename std::enable_if<
CheckNonLongTileSize(tile_long, tile_short, sizeof(T)), void>::type> {
static void DoTranspose(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims, T* output) {
T,
tile_long,
tile_short,
typename std::enable_if<CheckNonLongTileSize(
tile_long, tile_short, sizeof(T)),
void>::type> {
static void DoTranspose(const phi::GPUContext& d,
int tile_size_i,
int tile_size_j,
int total_tiles_count,
const T* input,
const Dim3& input_dims,
T* output) {
PADDLE_ENFORCE_EQ(
(tile_long & (tile_long - 1)), 0,
(tile_long & (tile_long - 1)),
0,
platform::errors::InvalidArgument(
"The length of the longer side of the tile should be power of 2."
" But received value is:%d.",
......@@ -343,13 +379,23 @@ struct NarrowDims2TransposeDispatch<
if (request_satisfied) {
LaunchNarrowDims2TransposeKernel<T, tile_long, tile_short>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
return;
}
NarrowDims2TransposeDispatch<T, tile_long, tile_short + 1>::DoTranspose(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
}
};
......@@ -357,34 +403,49 @@ struct NarrowDims2TransposeDispatch<
// If long tile size, goto this function when compile.
template <typename T, int tile_long, int tile_short>
struct NarrowDims2TransposeDispatch<
T, tile_long, tile_short,
T,
tile_long,
tile_short,
typename std::enable_if<CheckLongTileSize(tile_long, tile_short, sizeof(T)),
void>::type> {
static void DoTranspose(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims, T* output) {
static void DoTranspose(const phi::GPUContext& d,
int tile_size_i,
int tile_size_j,
int total_tiles_count,
const T* input,
const Dim3& input_dims,
T* output) {
PADDLE_ENFORCE_EQ(
(tile_long & (tile_long - 1)), 0,
(tile_long & (tile_long - 1)),
0,
platform::errors::InvalidArgument(
"The length of the longer side of the tile should be power of 2,"
" but received is:%d.",
tile_long));
LaunchNarrowDims2TransposeKernel<T, tile_long, tile_short>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
}
};
template <typename T, bool conjugate = false>
void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input,
const Dim3& input_dims, T* output,
void SwapDim1And2InNarrow(const phi::GPUContext& d,
const T* input,
const Dim3& input_dims,
T* output,
const int kMinTileSize) {
// First get available tile sizes for the data type requested as backups
std::vector<std::pair<int, int>> tile_sele;
auto ret = SelectProperTileSize<sizeof(T)>(&tile_sele);
PADDLE_ENFORCE_EQ(
ret, true,
ret,
true,
platform::errors::InvalidArgument(
"SelectProperTileSize should return true, but return value is:%d.",
ret));
......@@ -451,16 +512,22 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input,
using ElemType = typename SystemElemType<sizeof(T)>::type;
NarrowDims2TransposeDispatch<ElemType, 32, 2>::DoTranspose(
d, select_tile_size_i, select_tile_size_j, total_tiles_count,
reinterpret_cast<const ElemType*>(input), input_dims,
d,
select_tile_size_i,
select_tile_size_j,
total_tiles_count,
reinterpret_cast<const ElemType*>(input),
input_dims,
reinterpret_cast<ElemType*>(output));
}
// This is for the case that cannot do coalesced reads and writes,
// or the input is too small to split into tiles.
template <typename T, int pos0, int pos1, int pos2>
__global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input,
Dim3 input_dims, T* __restrict__ output) {
__global__ void TransposeSimpleKernel(int nthreads,
const T* __restrict__ input,
Dim3 input_dims,
T* __restrict__ output) {
Dim3 output_dims;
output_dims[pos0] = input_dims[0];
output_dims[pos1] = input_dims[1];
......@@ -482,8 +549,10 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input,
// Here we assume all tensors have been converted to dim3, so we just swap dim1 and dim2.
template <typename T>
void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input,
const Dim3& input_dims, T* output) {
void SendSwapDim1And2InTranspose(const phi::GPUContext& d,
const T* input,
const Dim3& input_dims,
T* output) {
// Suppose tile size > 16
static const int kMinTileSize = 16;
static const int kMinNarrowTileSize = 96;
......@@ -508,8 +577,8 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input,
input_dims_aligned[0] * input_dims_aligned[1] * input_dims_aligned[2];
TilingSwapDim1And2<T, kNumThreads, kTileSize, kTileSize>
<<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
output);
<<<total_tiles_count, kNumThreads, 0, d.stream()>>>(
input, input_dims, output);
} else if (narrow_tile) {
// If the input shape is rectangular, e.g. 2x100, use the narrow tile size.
......@@ -529,8 +598,10 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input,
template <typename T>
struct SwapDim1And2InTranspose {
typedef phi::GPUContext Device;
void operator()(const Device& d, const T* in,
const std::vector<int>& combined_dims, T* out) {
void operator()(const Device& d,
const T* in,
const std::vector<int>& combined_dims,
T* out) {
Dim3 input_dims = {static_cast<int>(combined_dims[0]),
static_cast<int>(combined_dims[1]),
static_cast<int>(combined_dims[2])};
......@@ -541,8 +612,10 @@ struct SwapDim1And2InTranspose {
template <typename T>
struct SwapDim0And2InTranspose {
typedef phi::GPUContext Device;
void operator()(const Device& d, const T* in,
const std::vector<int>& combined_dims, T* out) {
void operator()(const Device& d,
const T* in,
const std::vector<int>& combined_dims,
T* out) {
Dim3 input_dims = {static_cast<int>(combined_dims[0]),
static_cast<int>(combined_dims[1]),
static_cast<int>(combined_dims[2])};
......@@ -562,11 +635,13 @@ inline void CombineTransposeDim3(const framework::DDim& shape,
const std::vector<int>& perm,
std::vector<int>* new_perm,
framework::DDim* new_dims) {
PADDLE_ENFORCE_EQ(shape.size(), perm.size(),
PADDLE_ENFORCE_EQ(shape.size(),
perm.size(),
platform::errors::InvalidArgument(
" shape should have the save dim with perm, but"
" received shape size is:%d, perm size is:%d.",
shape.size(), perm.size()));
shape.size(),
perm.size()));
std::vector<int> dim_vec;
if (shape.size() == 1) {
......@@ -614,8 +689,10 @@ inline void CombineTransposeDim3(const framework::DDim& shape,
template <typename T>
struct TransposeSimple {
static bool run(const phi::GPUContext& ctx, const Tensor& in,
const std::vector<int32_t> perm, Tensor* out) {
static bool run(const phi::GPUContext& ctx,
const Tensor& in,
const std::vector<int32_t> perm,
Tensor* out) {
// First reduce the dimensions of the input tensor if possible.
std::vector<int> new_perm;
framework::DDim new_dims;
......@@ -805,7 +882,8 @@ __global__ void VectorizedPermuteKernel(PermuteParams<Rank, IndexT> params,
// A general kernel for normal case, only support vectorized write.
template <typename T, typename IndexT, int VecSize, int Rank>
__global__ void GeneralPermuteKernel(PermuteParams<Rank, IndexT> params,
const T* __restrict__ src, T* dst,
const T* __restrict__ src,
T* dst,
const size_t main_cnt,
const size_t tail_cnt,
const size_t offset) {
......@@ -859,10 +937,12 @@ __global__ void GeneralPermuteKernel(PermuteParams<Rank, IndexT> params,
// A general permute method that directly finds the dst data
// coordinate in the source data.
template <typename T, typename IndexT, int VecSize, int Rank>
inline void LaunchPermuteKernel(const phi::GPUContext& ctx, const IndexT count,
inline void LaunchPermuteKernel(const phi::GPUContext& ctx,
const IndexT count,
const PermuteType perm_type,
const std::vector<size_t>& dims,
const std::vector<int>& perm, const T* src,
const std::vector<int>& perm,
const T* src,
T* dst) {
size_t main_count = count / VecSize;
auto params = PermuteParams<Rank, IndexT>(dims, perm);
......@@ -871,14 +951,12 @@ inline void LaunchPermuteKernel(const phi::GPUContext& ctx, const IndexT count,
if (perm_type == PermuteType::kNormalPermute) {
size_t tail_count = count - main_count * VecSize;
size_t offset = count - tail_count;
GeneralPermuteKernel<
T, IndexT, VecSize,
Rank><<<config.GetGridSize(), config.GetBlockSize(), 0, ctx.stream()>>>(
GeneralPermuteKernel<T, IndexT, VecSize, Rank>
<<<config.GetGridSize(), config.GetBlockSize(), 0, ctx.stream()>>>(
params, src, dst, main_count, tail_count, offset);
} else {
VectorizedPermuteKernel<
T, IndexT, VecSize,
Rank><<<config.GetGridSize(), config.GetBlockSize(), 0, ctx.stream()>>>(
VectorizedPermuteKernel<T, IndexT, VecSize, Rank>
<<<config.GetGridSize(), config.GetBlockSize(), 0, ctx.stream()>>>(
params, main_count, src, dst);
}
}
......@@ -889,11 +967,12 @@ inline void LaunchPermuteRankDispatch(const phi::GPUContext& ctx,
const PermuteType perm_type,
const std::vector<size_t>& dims,
const std::vector<int>& perm,
const T* src, T* dst) {
const T* src,
T* dst) {
#define CALL_DISPATCH_RANK(rank) \
case rank: { \
LaunchPermuteKernel<T, IndexT, VecSize, rank>(ctx, count, perm_type, dims, \
perm, src, dst); \
LaunchPermuteKernel<T, IndexT, VecSize, rank>( \
ctx, count, perm_type, dims, perm, src, dst); \
break; \
}
......@@ -915,7 +994,9 @@ inline void LaunchPermuteRankDispatch(const phi::GPUContext& ctx,
// https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/
template <typename T, typename IndexT, int VecSize>
__global__ void BatchTransposeKernel(const T* __restrict__ src_data,
T* dst_data, IndexT rows, IndexT cols) {
T* dst_data,
IndexT rows,
IndexT cols) {
using VecT = phi::AlignedVector<T, VecSize>;
__shared__ VecT tile[kTileSize][kShareCol];
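// Presumably kShareCol pads each tile row by one extra column so that the
// column-wise accesses of the transposed write do not all land in the same
// shared-memory bank (the trick from the blog post linked above).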
......@@ -961,10 +1042,13 @@ __global__ void BatchTransposeKernel(const T* __restrict__ src_data,
// Given the byte limit of shared memory, VecSize must be restricted
// for types whose byte size is less than 8.
template <typename T, typename IndexT, int Size,
template <typename T,
typename IndexT,
int Size,
int VecSize = (sizeof(T) > 8 ? 1 : Size)>
inline void LaunchTransposeKernel(const phi::GPUContext& ctx,
const std::vector<size_t>& dims, const T* src,
const std::vector<size_t>& dims,
const T* src,
T* dst) {
auto rank = dims.size();
IndexT num_batches = (rank == 2) ? 1 : dims[0];
......@@ -976,9 +1060,8 @@ inline void LaunchTransposeKernel(const phi::GPUContext& ctx,
dim3 blocks(num_tile_cols, num_tile_rows, num_batches);
dim3 threads(kTileSize, kBlockRows, 1);
BatchTransposeKernel<T, IndexT,
VecSize><<<blocks, threads, 0, ctx.stream()>>>(
src, dst, rows, cols);
BatchTransposeKernel<T, IndexT, VecSize>
<<<blocks, threads, 0, ctx.stream()>>>(src, dst, rows, cols);
}
template <typename T, typename IndexT>
......@@ -987,14 +1070,16 @@ inline void LaunchWithDispatchVecSize(const phi::GPUContext& ctx,
const PermuteType perm_type,
const std::vector<size_t>& dims,
const std::vector<int>& perm,
const T* src, T* dst, IndexT count) {
const T* src,
T* dst,
IndexT count) {
#define CALL_DISPATCH_VEC_SIZE(vec_size) \
case vec_size: { \
if (perm_type == PermuteType::kTranspose) { \
LaunchTransposeKernel<T, IndexT, vec_size>(ctx, dims, src, dst); \
} else { \
LaunchPermuteRankDispatch<T, IndexT, vec_size>(ctx, count, perm_type, \
dims, perm, src, dst); \
LaunchPermuteRankDispatch<T, IndexT, vec_size>( \
ctx, count, perm_type, dims, perm, src, dst); \
} \
break; \
}
......@@ -1014,45 +1099,64 @@ inline void LaunchWithDispatchVecSize(const phi::GPUContext& ctx,
template <typename T>
inline void LaunchWithDispatchIndex(const phi::GPUContext& ctx,
const size_t count, const int vec_size,
const size_t count,
const int vec_size,
const PermuteType perm_type,
const std::vector<size_t>& dims,
const std::vector<int>& perm, const T* src,
const std::vector<int>& perm,
const T* src,
T* dst) {
if (count < std::numeric_limits<uint32_t>::max()) {
LaunchWithDispatchVecSize<T, uint32_t>(ctx, vec_size, perm_type, dims, perm,
src, dst,
LaunchWithDispatchVecSize<T, uint32_t>(ctx,
vec_size,
perm_type,
dims,
perm,
src,
dst,
static_cast<uint32_t>(count));
} else {
int64_t cnt = static_cast<int64_t>(count);
LaunchWithDispatchVecSize<T, int64_t>(ctx, vec_size, perm_type, dims, perm,
src, dst,
LaunchWithDispatchVecSize<T, int64_t>(ctx,
vec_size,
perm_type,
dims,
perm,
src,
dst,
static_cast<int64_t>(count));
}
}
template <typename DeviceContext, typename T>
inline void SimplifyThenLaunch(const int rank, const DeviceContext& ctx,
const Tensor& in, Tensor* out,
inline void SimplifyThenLaunch(const int rank,
const DeviceContext& ctx,
const Tensor& in,
Tensor* out,
const std::vector<int32_t>& perm) {
int sm_count = ctx.GetSMCount();
auto src_dims = phi::vectorize<size_t>(in.dims());
auto simplifier = DimsSimplifier<T>(sm_count, rank, perm, src_dims,
in.data<T>(), out->data<T>());
auto simplifier = DimsSimplifier<T>(
sm_count, rank, perm, src_dims, in.data<T>(), out->data<T>());
if (simplifier.GetPermType() == PermuteType::kCopy) {
// If perm is [0,1,2,3], then just perform a DtoD copy.
phi::Copy(ctx, in, ctx.GetPlace(), false, out);
} else {
LaunchWithDispatchIndex<T>(
ctx, simplifier.GetCount(), simplifier.GetVecSize(),
simplifier.GetPermType(), simplifier.GetDims(), simplifier.GetPerm(),
in.data<T>(), out->data<T>());
LaunchWithDispatchIndex<T>(ctx,
simplifier.GetCount(),
simplifier.GetVecSize(),
simplifier.GetPermType(),
simplifier.GetDims(),
simplifier.GetPerm(),
in.data<T>(),
out->data<T>());
}
}
template <typename T>
size_t GetTransposeKey(const int rank, const Tensor& in,
size_t GetTransposeKey(const int rank,
const Tensor& in,
const std::vector<int32_t>& perm) {
auto in_shape = phi::vectorize(in.dims());
return phi::autotune::GetKey(
......@@ -1060,15 +1164,19 @@ size_t GetTransposeKey(const int rank, const Tensor& in,
}
template <typename T>
void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int rank,
void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx,
const int rank,
const Tensor& in,
const std::vector<int32_t>& perm, Tensor* out) {
const std::vector<int32_t>& perm,
Tensor* out) {
PADDLE_ENFORCE_LT(
rank, phi::DDim::kMaxRank,
rank,
phi::DDim::kMaxRank,
platform::errors::OutOfRange(
"The maximum dimension rank of "
"tensor is expected to be less than %d, but here is %d.",
phi::DDim::kMaxRank, rank));
phi::DDim::kMaxRank,
rank));
auto ret = TransposeSimple<T>::run(dev_ctx, in, perm, out);
if (!ret) {
......
......@@ -19,8 +19,8 @@ limitations under the License. */
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/transfer_layout_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
......@@ -31,10 +32,7 @@ void copy(const Tensor& src, const Place& place, bool blocking, Tensor* dst) {
kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place));
auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
VLOG(6) << "copy API kernel key: " << kernel_key;
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy", kernel_key);
VLOG(6) << "copy API kernel: " << kernel;
VLOG(6) << "start copy. ";
auto target_place = phi::TransToPhiPlace(kernel_key.backend());
auto& pool = paddle::experimental::DeviceContextPool::Instance();
......@@ -47,14 +45,9 @@ void copy(const Tensor& src, const Place& place, bool blocking, Tensor* dst) {
phi::MetaTensor meta_out(kernel_out);
phi::UnchangedInferMeta(*dense_x, &meta_out);
using kernel_signature = void (*)(const platform::DeviceContext&,
const phi::DenseTensor&,
phi::Place,
bool,
phi::DenseTensor*);
phi::Copy(*dev_ctx, *dense_x, place, blocking, kernel_out);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out);
VLOG(6) << "copy finished. ";
}
} // namespace experimental
......
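The copy(...) hunk above drops the manual kernel selection and the variadic kernel-function call in favor of a direct call into phi. A condensed sketch of the remaining flow, assuming DeviceContextPool hands back a context for the chosen place as the diff suggests (output setup and logging omitted):

void CopySketch(const phi::DenseTensor& src, const phi::Place& place,
                bool blocking, phi::DenseTensor* dst) {
  auto& pool = paddle::experimental::DeviceContextPool::Instance();
  auto* dev_ctx = pool.GetMutable(place);          // context for the target place
  phi::Copy(*dev_ctx, src, place, blocking, dst);  // direct utility call
}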
......@@ -19,9 +19,11 @@ limitations under the License. */
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/tensor_base.h"
#include "paddle/phi/api/include/context_pool.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/infermeta/unary.h"
// clang-format off
......@@ -113,9 +115,15 @@ void Tensor::copy_(const Tensor &src,
// Deep Copy AutoGrad info from src to self.
*autograd_meta_ = *(src.autograd_meta_);
}
kernel_key_set.backend_set =
kernel_key_set.backend_set |
BackendSet(phi::TransToPhiBackend(target_place));
auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
auto *dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
auto place = phi::TransToPhiPlace(kernel_key.backend());
auto& pool = paddle::experimental::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetMutable(
place.GetType() == target_place.GetType() ? target_place : place);
Backend kernel_backend = Backend::UNDEFINED;
DataLayout kernel_layout = DataLayout::UNDEFINED;
DataType kernel_data_type = DataType::UNDEFINED;
......@@ -135,45 +143,25 @@ void Tensor::copy_(const Tensor &src,
}
if (kernel_type == KernelType::DENSE_TENSOR_KENREL) {
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy", {kernel_backend, kernel_layout, kernel_data_type});
VLOG(6) << "copy API kernel key: " << kernel_key;
VLOG(6) << "copy API kernel: " << kernel;
using kernel_signature = void (*)(const platform::DeviceContext &,
const phi::DenseTensor &,
phi::Place,
bool,
phi::DenseTensor *);
SetKernelOutput(kernel_backend, this);
phi::MetaTensor meta_out(impl_.get());
phi::UnchangedInferMeta(
MakeMetaTensor(
*(std::static_pointer_cast<phi::DenseTensor>(src.impl_))),
&meta_out);
auto *kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx,
phi::Copy(*dev_ctx,
(*(std::static_pointer_cast<phi::DenseTensor>(src.impl_))),
target_place,
blocking,
static_cast<phi::DenseTensor *>(impl_.get()));
} else if (kernel_type == KernelType::SELECTED_ROWS_KENREL) {
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy_sr", {kernel_backend, kernel_layout, kernel_data_type});
VLOG(6) << "copy API kernel key: " << kernel_key;
VLOG(6) << "copy API kernel: " << kernel;
using kernel_signature = void (*)(const platform::DeviceContext &,
const phi::SelectedRows &,
phi::Place,
bool,
phi::SelectedRows *);
SetSelectedRowsKernelOutput(kernel_backend, this);
phi::MetaTensor meta_out(impl_.get());
phi::UnchangedInferMeta(
MakeMetaTensor(
*(std::static_pointer_cast<phi::SelectedRows>(src.impl_))),
&meta_out);
auto *kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx,
phi::Copy(*dev_ctx,
(*(std::static_pointer_cast<phi::SelectedRows>(src.impl_))),
target_place,
blocking,
......
......@@ -81,3 +81,43 @@ if(WITH_MKLDNN)
add_dependencies(dense_tensor mkldnn)
add_dependencies(tensor_base mkldnn)
endif()
if(WITH_GPU)
nv_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS cpu_context
gpu_context
dense_tensor
selected_rows
malloc
memcpy
device_context)
elseif(WITH_ROCM)
hip_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS cpu_context
gpu_context
dense_tensor
selected_rows
malloc
memcpy
device_context)
elseif(WITH_XPU_KP)
xpu_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS cpu_context
xpu_context
dense_tensor
selected_rows
malloc
memcpy
device_context)
else()
cc_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS cpu_context dense_tensor selected_rows malloc memcpy device_context)
endif()
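With the library in place, any target that needs phi::Copy simply links phi_tensor_utils; the COMMON_KERNEL_DEPS hunk further below does exactly that for all kernels. A hedged sketch for a standalone consumer (target and source names are hypothetical):

cc_library(
  my_copy_demo
  SRCS my_copy_demo.cc
  DEPS phi_tensor_utils)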
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
......@@ -43,27 +43,46 @@ void Copy(const Context& dev_ctx,
void* dst_ptr = nullptr;
if (paddle::platform::is_cpu_place(dst_place)) {
dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
} else if (paddle::platform::is_cuda_pinned_place(dst_place)) {
// now we only can use mutable_data to Alloc pinned memory here,
// dev_ctx can not alloc pinned memory now
dst_ptr = dst->mutable_data(dst_place, src.dtype());
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if (paddle::platform::is_gpu_place(dst_place) ||
paddle::platform::is_cuda_pinned_place(dst_place)) {
dst_ptr = dev_ctx.Alloc(
dst, src.dtype(), 0, paddle::platform::is_cuda_pinned_place(dst_place));
#endif
#ifdef PADDLE_WITH_XPU
} else if (paddle::platform::is_xpu_place(dst_place)) {
dst_ptr = dev_ctx.Alloc(dst, src.dtype());
#endif
}
auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
if (UNLIKELY(size == 0)) {
return;
}
PADDLE_ENFORCE_EQ(
dst->place(),
dst_place,
phi::errors::Unavailable(
"The Dst Tensor's place and dst_place do not match, Tensor's place "
"place is %s, dst_place is %s.",
dst->place(),
dst_place));
if (src_ptr == dst_ptr && src_place == dst_place) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
CHECK(dst->layout() == src.layout());
auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
if ((paddle::platform::is_cpu_place(src_place) ||
if (paddle::platform::is_cpu_place(src_place) &&
paddle::platform::is_cpu_place(dst_place)) {
paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if ((paddle::platform::is_cpu_place(src_place) ||
paddle::platform::is_cuda_pinned_place(src_place)) && // NOLINT
(paddle::platform::is_cpu_place(dst_place) ||
paddle::platform::is_cuda_pinned_place(dst_place))) {
......@@ -176,13 +195,87 @@ void Copy(const Context& dev_ctx,
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
#endif
}
#ifdef PADDLE_WITH_XPU
else if (paddle::platform::is_xpu_place(src_place) && // NOLINT
paddle::platform::is_cpu_place(dst_place)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_cpu_place(src_place) &&
paddle::platform::is_xpu_place(dst_place)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_xpu_place(src_place) &&
paddle::platform::is_xpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else {
PADDLE_THROW(phi::errors::InvalidArgument(
"Place type error. Please check the place of src and dst Tensor."));
PADDLE_THROW(phi::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
}
} // namespace phi
template <typename Context>
void Copy(const Context& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst) {
if (src.value().Holder() != dst->value().Holder() ||
src.value().data() != dst->value().data()) {
dst->set_rows(src.rows());
dst->set_height(src.height());
}
Copy<Context>(
dev_ctx, src.value(), dst_place, blocking, dst->mutable_value());
}
template void Copy(const CPUContext& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
template void Copy(const DeviceContext& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
template void Copy(const CPUContext& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst);
template void Copy(const DeviceContext& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst);
PD_REGISTER_GENERAL_KERNEL(
copy, GPU, ALL_LAYOUT, phi::Copy<phi::GPUContext>, ALL_DTYPE) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template void Copy(const GPUContext& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
template void Copy(const GPUContext& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst);
#endif
#ifdef PADDLE_WITH_XPU
template void Copy(const XPUContext& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
#endif
} // namespace phi
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/tensor_meta.h"
namespace phi {
......@@ -70,4 +71,18 @@ class DenseTensorUtils {
}
};
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
template <typename Context>
void Copy(const Context& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst);
} // namespace phi
......@@ -29,7 +29,8 @@ set(COMMON_KERNEL_DEPS
arg_map_context
convert_utils
lod_utils
custom_kernel)
custom_kernel
phi_tensor_utils)
set(COMMON_KERNEL_DEPS
${COMMON_KERNEL_DEPS}
eigen_function
......
......@@ -16,7 +16,7 @@
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/utils/optional.h"
namespace phi {
......
......@@ -20,8 +20,8 @@
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
namespace tune = phi::autotune;
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
} // namespace phi
......@@ -20,7 +20,7 @@
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
DECLARE_int32(inner_op_parallelism);
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace phi {
// NOTE(chenweihang): blocking is useless in cpu kernel
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
const auto& src_place = src.place();
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
dst->Resize(src.dims());
auto* dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
CHECK(dst->layout() == src.layout());
auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
if (paddle::platform::is_cpu_place(src_place)) {
paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
}
}
} // namespace phi
PD_REGISTER_GENERAL_KERNEL(
copy, CPU, ALL_LAYOUT, phi::Copy<phi::CPUContext>, ALL_DTYPE) {}
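Note that the moved function is still registered under the kernel name "copy", so string-based dispatch keeps working alongside the direct calls. A sketch of such a lookup, reusing the KernelFactory API shown earlier in this diff (FLOAT32 is an arbitrary example):

auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
    "copy",
    {phi::Backend::CPU, phi::DataLayout::ALL_LAYOUT, phi::DataType::FLOAT32});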
......@@ -16,8 +16,8 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/softmax_kernel.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cpu/elementwise_grad.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cpu/elementwise_grad.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cpu/elementwise_grad.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......
......@@ -15,7 +15,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -22,7 +22,7 @@
#endif
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/mode.h"
namespace phi {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cpu/rnn_functor.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cpu/rnn_functor.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.h"
#include "paddle/phi/kernels/funcs/scatter.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/scatter.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/scatter.h"
namespace phi {
......
......@@ -21,7 +21,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -16,8 +16,8 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
namespace phi {
......
......@@ -35,7 +35,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -24,7 +24,7 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......
......@@ -24,7 +24,7 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/range_function.h"
namespace phi {
......
......@@ -28,8 +28,8 @@ namespace cub = hipcub;
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -28,8 +28,8 @@ namespace cub = hipcub;
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -19,7 +19,7 @@
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/common/place.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_grad_base.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
......
......@@ -19,7 +19,7 @@
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/elementwise_grad_kernel.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/norm_utils.h"
#include "paddle/phi/kernels/gpu/instance_norm_utils.h"
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/data_type_transform.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/psroi_pool_kernel.h"
......
......@@ -20,7 +20,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
namespace phi {
......
......@@ -22,7 +22,7 @@
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/gpu/reduce.h"
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -28,7 +28,7 @@
#include "paddle/fluid/framework/tensor_util.h" // TensorToVector()
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/unique_functor.h"
namespace phi {
......
......@@ -33,7 +33,7 @@ namespace cub = hipcub;
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
......
......@@ -17,7 +17,6 @@
#include "paddle/phi/kernels/cholesky_solve_grad_kernel.h"
#include "paddle/phi/kernels/cholesky_solve_kernel.h"
#include "paddle/phi/kernels/complex_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/elementwise_add_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/expand_kernel.h"
......
......@@ -16,7 +16,6 @@
#include "paddle/phi/kernels/cholesky_solve_kernel.h"
#include "paddle/phi/kernels/complex_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/expand_kernel.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/determinant_grad_kernel.h"
#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h"
namespace phi {
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/impl/expand_kernel_impl.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/meshgrid_kernel.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
template <typename T, typename Context>
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include <type_traits>
#include <vector>
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/tile_grad_kernel.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
template <typename T, typename Context>
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
namespace phi {
......
......@@ -20,7 +20,7 @@
#include "paddle/fluid/operators/math/sequence_scale.h"
#include "paddle/phi/backends/dynload/warpctc.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/utils/optional.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -16,8 +16,8 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/selected_rows/copy_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
namespace phi {
namespace sr {

template <typename Context>
void Copy(const Context& dev_ctx,
          const SelectedRows& src,
          Place dst_place,
          bool blocking,
          SelectedRows* dst) {
  // When src and dst do not already share the same underlying storage,
  // mirror the SelectedRows metadata (rows and height) onto dst.
  if (src.value().Holder() != dst->value().Holder() ||
      src.value().data() != dst->value().data()) {
    dst->set_rows(src.rows());
    dst->set_height(src.height());
  }
  // Delegate the actual data movement of the value tensor to the shared
  // phi::Copy utility.
  phi::Copy<Context>(
      dev_ctx, src.value(), dst_place, blocking, dst->mutable_value());
}

}  // namespace sr
}  // namespace phi
PD_REGISTER_GENERAL_KERNEL(
copy_sr, CPU, ALL_LAYOUT, phi::sr::Copy<phi::CPUContext>, ALL_DTYPE) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_GENERAL_KERNEL(
copy_sr, GPU, ALL_LAYOUT, phi::sr::Copy<phi::GPUContext>, ALL_DTYPE) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace phi {
namespace sr {

template <typename Context>
void Copy(const Context& dev_ctx,
          const SelectedRows& src,
          Place dst_place,
          bool blocking,
          SelectedRows* dst);

}  // namespace sr
}  // namespace phi
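For illustration, a minimal usage sketch of the SelectedRows copy declared above; the device context, tensor names, and places below are assumptions for the example, not part of this commit:

// Hypothetical caller: copy a SelectedRows to CPU memory.
// Assumes `dev_ctx` is a live phi::CPUContext or phi::GPUContext and
// `src` is an initialized phi::SelectedRows.
phi::SelectedRows dst;
phi::sr::Copy(dev_ctx, src, phi::CPUPlace(), /*blocking=*/true, &dst);
// dst now carries src's rows/height metadata and a copy of its value tensor.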
......@@ -19,7 +19,7 @@
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
namespace phi {
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......
......@@ -24,7 +24,7 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
namespace sparse {
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
......
......@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h"
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
namespace sparse {
......
......@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/sparse/utils.cu.h"
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
namespace phi {
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
......
......@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
......
......@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h"
......
......@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/pooling.h"
......
......@@ -19,8 +19,8 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/activation_grad_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#define DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \
......
......@@ -19,8 +19,8 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#define DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/strings/gpu/copy_utils.h"
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace phi {

template <typename Context>
void Copy(const Context& dev_ctx,
          const DenseTensor& src,
          Place dst_place,
          bool blocking,
          DenseTensor* dst) {
  auto* src_ptr = src.data();
  void* dst_ptr = nullptr;
  dst->Resize(src.dims());
  // Allocate dst on the host or on the XPU device, depending on dst_place.
  if (paddle::platform::is_cpu_place(dst_place)) {
    dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
  } else {
    dst_ptr = dev_ctx.Alloc(dst, src.dtype());
  }

  const auto& src_place = src.place();
  // Nothing to do when src and dst already alias the same buffer and place.
  if (src_ptr == dst_ptr && src_place == dst_place) {
    VLOG(3) << "Skip copy the same data async from " << src_place << " to "
            << dst_place;
    return;
  }
  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
          << dst_place;

  CHECK(dst->layout() == src.layout());
  auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
  // Dispatch to the matching memcpy path: XPU->CPU, CPU->XPU, or XPU->XPU.
  if (paddle::platform::is_xpu_place(src_place) &&  // NOLINT
      paddle::platform::is_cpu_place(dst_place)) {
    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else if (paddle::platform::is_cpu_place(src_place) &&
             paddle::platform::is_xpu_place(dst_place)) {
    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else if (paddle::platform::is_xpu_place(src_place) &&
             paddle::platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
              << dst_place;
      return;
    }
    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else {
    PADDLE_THROW(phi::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
}

}  // namespace phi
PD_REGISTER_GENERAL_KERNEL(
copy, XPU, ALL_LAYOUT, phi::Copy<phi::XPUContext>, ALL_DTYPE) {}
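As a usage sketch of the XPU path above (assuming a live phi::XPUContext named `xpu_ctx` and an XPU-resident DenseTensor `src`, neither of which appears in this diff):

// Hypothetical caller: bring an XPU tensor back to host memory.
phi::DenseTensor host_tensor;
phi::Copy(xpu_ctx, src, phi::CPUPlace(), /*blocking=*/true, &host_tensor);
// host_tensor is resized to src.dims() and filled via paddle::memory::Copy.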
......@@ -29,7 +29,6 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
#endif
namespace paddle {
......
......@@ -22,7 +22,6 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
namespace paddle {
namespace tests {
......
......@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device_context.h"
......
......@@ -17,12 +17,6 @@
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
#endif
namespace paddle {
namespace tests {
......
......@@ -25,7 +25,6 @@ limitations under the License. */
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(scale_sr, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
namespace paddle {
namespace tests {
......
......@@ -21,11 +21,6 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
#endif
namespace paddle {
namespace tests {
......
......@@ -22,10 +22,8 @@ limitations under the License. */
#include "paddle/phi/kernels/full_kernel.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
#endif
namespace phi {
......
......@@ -25,8 +25,6 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
namespace phi {
namespace tests {
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
namespace tests {
......
......@@ -23,16 +23,6 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/flatten_kernel.h"
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
#endif
#ifdef PADDLE_WITH_XPU
PD_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT);
#endif
namespace phi {
namespace tests {
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
#include "paddle/phi/kernels/sparse/convolution_kernel.h"
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
......
......@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
namespace phi {
......