Unverified commit 9a7b9eda authored by zyfncg, committed by GitHub

[Pten] Refactor the copy kernel (#39731)

* remove SetAllocationForOutputTenosr

* add place param for copy kernel

* recover SetAllocationForOutputTenosr

* polish code

* fix empty_dev api bug

* test=allcases

* test=allcases

* fix bug

* recover empty

* recover modify
Parent 581b2c64
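The core of the refactor is that phi::Copy now takes the destination Place explicitly instead of deriving it from the output tensor. A minimal sketch of how a call site changes, assuming a device context dev_ctx, a source DenseTensor src, and an output DenseTensor* dst as in the kernels below:

  // before: the destination place was taken from dst->place()
  //   phi::Copy(dev_ctx, src, /*blocking=*/false, dst);
  // after: the caller passes the place, typically that of the current context
  phi::Copy(dev_ctx, src, dev_ctx.GetPlace(), /*blocking=*/false, dst);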
......@@ -32,39 +32,33 @@ namespace paddle {
namespace experimental {
Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
// 1. Get kernel signature and kernel
auto kernel_key_set = ParseKernelKeyByInputArgs(x);
kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy", kernel_key);
VLOG(0) << "to API kernel key: " << kernel_key;
VLOG(0) << "to API kernel: " << kernel;
VLOG(6) << "copy API kernel key: " << kernel_key;
VLOG(6) << "copy API kernel: " << kernel;
// 2. Get Device Context
auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
auto kernel_context = phi::KernelContext(dev_ctx);
// 3. Auto data transform
auto dense_x = std::dynamic_pointer_cast<phi::DenseTensor>(x.impl());
kernel_context.EmplaceBackInput(dense_x.get());
kernel_context.EmplaceBackAttr(blocking);
// 4. Prepare outputs & InferMeta
auto dense_out = std::make_shared<phi::DenseTensor>(
phi::make_intrusive<paddle::experimental::SharedStorage>(
phi::TransToPtenPlace(backend)),
phi::DenseTensorMeta());
phi::MetaTensor meta_out(dense_out.get());
phi::UnchangedInferMeta(*dense_x, &meta_out);
dense_out->mutable_data(phi::TransToPtenPlace(backend));
kernel_context.EmplaceBackOutput(dense_out.get());
auto dense_x = TensorToDenseTensor(x);
Tensor out;
out.set_impl(dense_out);
auto kernel_out = SetKernelOutput(kernel_key.backend(), &out);
phi::MetaTensor meta_out(kernel_out);
phi::UnchangedInferMeta(*dense_x, &meta_out);
// 5. Call kernel
kernel(&kernel_context);
using kernel_signature = void (*)(const platform::DeviceContext&,
const phi::DenseTensor&,
phi::Place,
bool,
phi::DenseTensor*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(
*dev_ctx, *dense_x, phi::TransToPtenPlace(backend), blocking, kernel_out);
return out;
}
......
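With this rewrite, copy_to_impl no longer builds a KernelContext; it resolves a typed function pointer via GetVariadicKernelFn and invokes it directly with the target place. A rough, hypothetical usage sketch of the helper, where the tensor x and the chosen backend are assumptions (the enclosing namespace of Backend is also assumed):

  // hypothetical call site: copy an existing Tensor `x` to the GPU backend
  paddle::experimental::Tensor y =
      paddle::experimental::copy_to_impl(x, Backend::GPU, /*blocking=*/false);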
......@@ -245,6 +245,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int64_t>&);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int>&);
......
......@@ -22,6 +22,7 @@ namespace phi {
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
} // namespace phi
......@@ -28,6 +28,7 @@ namespace phi {
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
......
......@@ -26,8 +26,8 @@ void FlattenGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
auto xshape_dims = xshape.dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
phi::Copy(dev_ctx, out_grad, false, x_grad);
x_grad->ResizeAndAllocate(x_dims);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
x_grad->Resize(x_dims);
}
} // namespace phi
......
......@@ -28,8 +28,8 @@ void FlattenKernel(const Context& dev_ctx,
int stop_axis,
DenseTensor* out) {
auto out_dims = out->dims();
phi::Copy(dev_ctx, x, false, out);
out->ResizeAndAllocate(out_dims);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(out_dims);
}
// TODO(yuanrisheng): this kernel is for training and xshape is an Intermediate
......
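Both flatten kernels above follow the same idiom: Copy allocates the output through the device context and resets its dims to the source dims, so only a metadata-level Resize is needed afterwards (which is why ResizeAndAllocate is no longer required). A condensed sketch, assuming dev_ctx, x, out, and out_dims as in FlattenKernel:

  auto out_dims = out->dims();  // target shape already produced by InferMeta
  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), /*blocking=*/false, out);  // copies data; dims follow x
  out->Resize(out_dims);        // restore the flattened shape; metadata only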
......@@ -28,11 +28,11 @@ namespace phi {
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
const auto& src_place = src.place();
auto dst_place = dst->place();
if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) {
PADDLE_THROW(phi::errors::InvalidArgument(
......@@ -43,8 +43,14 @@ void Copy(const Context& dev_ctx,
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
dst->ResizeAndAllocate(src.dims());
auto* dst_ptr = dst->mutable_data(dst_place);
dst->Resize(src.dims());
void* dst_ptr = nullptr;
if (paddle::platform::is_cpu_place(dst_place)) {
dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
} else {
dst_ptr = dev_ctx.Alloc(dst, src.dtype());
}
if (src_ptr == dst_ptr && src_place == dst_place) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
......@@ -57,17 +63,8 @@ void Copy(const Context& dev_ctx,
auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT
paddle::platform::is_cuda_pinned_place(dst_place)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT
paddle::platform::is_cpu_place(dst_place)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_cpu_place(src_place) && // NOLINT
paddle::platform::is_cuda_pinned_place(dst_place)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
paddle::platform::is_cpu_place(dst_place)) {
if (paddle::platform::is_gpu_place(src_place) && // NOLINT
paddle::platform::is_cpu_place(dst_place)) {
auto src_gpu_place = src_place;
auto dst_cpu_place = dst_place;
auto ctx_place = dev_ctx.GetPlace();
......@@ -114,56 +111,6 @@ void Copy(const Context& dev_ctx,
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
paddle::platform::is_cuda_pinned_place(dst_place)) {
auto src_gpu_place = src_place;
auto dst_cuda_pinned_place = dst_place;
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
true,
phi::errors::PreconditionNotMet(
"Device context place mismatch. When copying Tensor "
"data from GPU memory to CUDA Pinned memory, current "
"device context place should be GPU."));
auto ctx_gpu_place = ctx_place;
PADDLE_ENFORCE_EQ(src_gpu_place,
ctx_gpu_place,
phi::errors::PreconditionNotMet(
"The source GPU device and current device context do "
"not match. The source GPU device number is %d, but "
"device context GPU number is %d.",
src_gpu_place.device,
ctx_gpu_place.device));
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT
paddle::platform::is_gpu_place(dst_place)) {
auto src_cuda_pinned_place = src_place;
auto dst_gpu_place = dst_place;
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
true,
phi::errors::PreconditionNotMet(
"Device context place mismatch. When copying Tensor "
"data from CUDA Pinned memory to GPU memory, current "
"device context place should be GPU."));
auto ctx_gpu_place = ctx_place;
PADDLE_ENFORCE_EQ(dst_gpu_place,
ctx_gpu_place,
phi::errors::PreconditionNotMet(
"The target GPU device and current device context do "
"not match. The target GPU device number is %d, but "
"device context GPU number is %d.",
dst_gpu_place.device,
ctx_gpu_place.device));
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
paddle::platform::is_gpu_place(dst_place)) {
auto src_gpu_place = src_place;
......
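In the GPU Copy kernel the destination buffer is now allocated through the device context according to the requested dst_place, rather than via DenseTensor::mutable_data on the output's own place; host destinations go through HostAlloc and device destinations through Alloc. A condensed sketch of that branch, with names as in the diff above:

  dst->Resize(src.dims());
  void* dst_ptr = nullptr;
  if (paddle::platform::is_cpu_place(dst_place)) {
    dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());  // destination lives on the host
  } else {
    dst_ptr = dev_ctx.Alloc(dst, src.dtype());      // destination lives on the device
  }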
......@@ -1460,7 +1460,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout.dims()) {
if (dx_data != dout_data) {
phi::Copy(ctx, dout, false, dx);
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
}
} else {
// For inplace strategy, dx will be stored in addr of dout, which makes
......@@ -1481,7 +1481,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout.dims()) {
if (dy_data != dout_data) {
phi::Copy(ctx, dout, false, dy);
phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
}
} else {
std::vector<int> reduce_dims =
......@@ -1507,11 +1507,11 @@ void elementwise_add_grad(const GPUContext &ctx,
if (dx_data == dout_data && dy_data != dout_data) {
VLOG(4) << "Special case when dx_data is the same as dout_data, "
"only need copy dout to dy";
phi::Copy(ctx, dout, false, dy);
phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
} else if (dx_data != dout_data && dy_data == dout_data) {
VLOG(4) << "Special case when dy_data is the same as dout_data, "
"only need copy dout to dx";
phi::Copy(ctx, dout, false, dx);
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
} else if (dx_data != dout_data && dy_data != dout_data) {
auto size = x.numel();
int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
......@@ -1571,7 +1571,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout.dims()) {
if (dx_data != dout_data) {
phi::Copy(ctx, dout, false, dx);
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
}
} else {
// For inplace strategy, dx will be stored in addr of dout, which makes
......
......@@ -41,7 +41,7 @@ void FullKernel(const Context& dev_ctx,
DenseTensor* out) {
out->Resize(phi::make_ddim(shape.GetData()));
int numel = out->numel();
out->mutable_data<T>(dev_ctx.GetPlace());
dev_ctx.template Alloc<T>(out);
if (numel > 0) {
// in the transformer model the numel of the output will be zero.
std::vector<const DenseTensor*> inputs = {};
......@@ -85,7 +85,7 @@ void FullLikeKernel(const Context& dev_ctx,
static_cast<float>(value)));
std::vector<const DenseTensor*> inputs = {};
std::vector<DenseTensor*> outputs = {out};
out->mutable_data<T>(dev_ctx.GetPlace());
dev_ctx.template Alloc<T>(out);
// This function has no input, so the inputs.size() == 0. Use kUnary, but the
// data will not be loaded in the kernel because the number of parameters in
// the operator is 0
......
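The full/full_like kernels also move from the legacy out->mutable_data<T>(place) call to context-driven allocation, keeping the allocation decision with the DeviceContext. A minimal sketch of the migration, assuming a Context& dev_ctx and a DenseTensor* out:

  // before (legacy): out->mutable_data<T>(dev_ctx.GetPlace());
  // after: the device context performs the allocation
  T* data = dev_ctx.template Alloc<T>(out);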
......@@ -36,12 +36,12 @@ void AddGradImpl(const Context& dev_ctx,
x_grad->dims() == out_grad.dims()) {
VLOG(4) << "Special case when y_grad is not needed and x_grad doesn't "
"reduce";
phi::Copy(dev_ctx, out_grad, false, x_grad);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
} else if (x_grad == nullptr && y_grad != nullptr &&
y_grad->dims() == out_grad.dims()) {
VLOG(4) << "Special case when x_grad is not needed and y_grad doesn't "
"reduce";
phi::Copy(dev_ctx, out_grad, false, y_grad);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, y_grad);
} else {
grad_func(dev_ctx, x, y, *out, out_grad, x_grad, y_grad, axis);
}
......
......@@ -88,7 +88,7 @@ void ExpandGradKernel(const Context& ctx,
}
// no need to reduce, just copy
if (just_copy) {
phi::Copy(ctx, out_grad, false, in_grad);
phi::Copy(ctx, out_grad, ctx.GetPlace(), false, in_grad);
} else {
PADDLE_ENFORCE_GE(dims,
1,
......
......@@ -32,7 +32,7 @@ void SizeKernel(const Context& ctx,
cpu_tensor.Resize(out->dims());
auto cpu_data = ctx.template HostAlloc<int64_t>(&cpu_tensor);
cpu_data[0] = input.numel();
phi::Copy(ctx, cpu_tensor, false, out);
phi::Copy(ctx, cpu_tensor, place, false, out);
}
}
......
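SizeKernel shows the host-staging pattern under the new signature: the scalar is written into a CPU tensor and then copied to the output's place. A condensed sketch, assuming ctx, input, out, and place as in the kernel above:

  DenseTensor cpu_tensor;
  cpu_tensor.Resize(out->dims());
  auto* cpu_data = ctx.template HostAlloc<int64_t>(&cpu_tensor);  // staging buffer on the host
  cpu_data[0] = input.numel();                                    // write the element count
  phi::Copy(ctx, cpu_tensor, place, /*blocking=*/false, out);     // move it to the target place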
......@@ -24,7 +24,7 @@ void ReshapeGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
DenseTensor* x_grad) {
auto x_dims = x_grad->dims();
phi::Copy(dev_ctx, out_grad, false, x_grad);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
x_grad->Resize(x_dims);
}
......
......@@ -36,7 +36,7 @@ void ReshapeKernel(const Context& dev_ctx,
// TODO(chenweihang): the output dims are overwritten after copying;
// here we need a copy method that only copies data
auto dims = out->dims();
phi::Copy(dev_ctx, x, false, out);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(dims);
out->ResetLoD(x.lod());
}
......
......@@ -27,12 +27,19 @@ namespace phi {
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype());
void* dst_ptr = nullptr;
dst->Resize(src.dims());
if (paddle::platform::is_cpu_place(dst_place)) {
dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
} else {
dst_ptr = dev_ctx.Alloc(dst, src.dtype());
}
const auto& src_place = src.place();
const auto& dst_place = dst->place();
if (src_ptr == dst_ptr && src_place == dst_place) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
......@@ -43,7 +50,7 @@ void Copy(const Context& dev_ctx,
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
dst->ResizeAndAllocate(src.dims());
CHECK(dst->layout() == src.layout());
auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
......
......@@ -127,8 +127,8 @@ TEST(API, matmul_cuda) {
auto place = paddle::platform::CUDAPlace();
auto* dev_ctx = static_cast<const phi::GPUContext*>(pool.GetByPlace(place));
phi::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get());
phi::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get());
phi::Copy(*dev_ctx, *ref_x.get(), phi::GPUPlace(), false, dense_x.get());
phi::Copy(*dev_ctx, *ref_y.get(), phi::GPUPlace(), false, dense_y.get());
paddle::experimental::Tensor x(dense_x);
paddle::experimental::Tensor y(dense_y);
......@@ -152,7 +152,7 @@ TEST(API, matmul_cuda) {
phi::DenseTensorMeta(
phi::DataType::FLOAT32, out.dims(), phi::DataLayout::NCHW));
phi::Copy(*dev_ctx, *dense_out.get(), false, ref_out.get());
phi::Copy(*dev_ctx, *dense_out.get(), phi::CPUPlace(), false, ref_out.get());
for (size_t i = 0; i < 9; i++) {
ASSERT_NEAR(sum[i], ref_out->data<float>()[i], 1e-6f);
......
......@@ -62,7 +62,8 @@ TEST(DEV_API, copy) {
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
phi::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get());
phi::Copy(
dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get());
// 3. check result
for (int64_t i = 0; i < dense_src->numel(); i++) {
......
......@@ -39,7 +39,7 @@ TEST(DEV_API, empty) {
dev_ctx.Init();
// 2. test API
auto out = phi::Empty<float>(dev_ctx, {3, 2}, phi::DataType::INT32);
auto out = phi::Empty<int>(dev_ctx, {3, 2}, phi::DataType::INT32);
// 3. check result
ASSERT_EQ(out.dims().size(), 2);
......
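The test fix above makes the template argument of phi::Empty consistent with the requested DataType (INT32), presumably the empty_dev issue mentioned in the commit message. A minimal hedged check in the style of the surrounding test:

  auto out = phi::Empty<int>(dev_ctx, {3, 2}, phi::DataType::INT32);
  ASSERT_EQ(out.dtype(), phi::DataType::INT32);  // element type and dtype agree
  auto* raw = out.data<int>();                   // access uses the matching C++ type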
......@@ -53,8 +53,8 @@ inline void CheckResult(
DenseTensorMeta(real_elements.dtype(),
real_elements.dims(),
real_elements.layout()));
phi::Copy(*dev_ctx_gpu, real_indices, true, &indices);
phi::Copy(*dev_ctx_gpu, real_elements, true, &elements);
phi::Copy(*dev_ctx_gpu, real_indices, indices.place(), true, &indices);
phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements);
int cmp_indices = memcmp(indices.data<IndicesT>(),
non_zero_indices.data(),
......@@ -122,7 +122,7 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
cuda_alloc.get(),
DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout()));
phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x);
phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x);
auto sparse_out =
sparse::DenseToSparseCoo<T>(dev_ctx_gpu, d_dense_x, sparse_dim);
CheckResult<T, int64_t>(&dev_ctx_gpu,
......@@ -327,9 +327,9 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
phi::DenseTensor d_values(cuda_alloc.get(), values_meta);
phi::Copy(dev_ctx_gpu, crows, true, &d_crows);
phi::Copy(dev_ctx_gpu, cols, true, &d_cols);
phi::Copy(dev_ctx_gpu, values, true, &d_values);
phi::Copy(dev_ctx_gpu, crows, d_crows.place(), true, &d_crows);
phi::Copy(dev_ctx_gpu, cols, d_cols.place(), true, &d_cols);
phi::Copy(dev_ctx_gpu, values, d_values.place(), true, &d_values);
phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(dev_ctx_gpu, d_csr);
CheckResult<T, int64_t>(&dev_ctx_gpu,
......@@ -406,9 +406,9 @@ inline void CheckCsrResult(
DenseTensorMeta(real_elements.dtype(),
real_elements.dims(),
real_elements.layout()));
phi::Copy(*dev_ctx_gpu, real_crows, true, &crows);
phi::Copy(*dev_ctx_gpu, real_cols, true, &cols);
phi::Copy(*dev_ctx_gpu, real_elements, true, &elements);
phi::Copy(*dev_ctx_gpu, real_crows, crows.place(), true, &crows);
phi::Copy(*dev_ctx_gpu, real_cols, cols.place(), true, &cols);
phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements);
int cmp_crows = memcmp(crows.data<IndicesT>(),
non_zero_crows.data(),
......@@ -500,8 +500,8 @@ void TestCooToCsr(const DDim& dense_dims,
dev_ctx_gpu.PartialInitWithAllocator();
phi::DenseTensor d_indices(cuda_alloc.get(), indices_meta);
phi::DenseTensor d_values(cuda_alloc.get(), values_meta);
phi::Copy(dev_ctx_gpu, indices, true, &d_indices);
phi::Copy(dev_ctx_gpu, values, true, &d_values);
phi::Copy(dev_ctx_gpu, indices, phi::GPUPlace(), true, &d_indices);
phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values);
phi::SparseCooTensor d_coo(d_indices, d_values, dense_dims);
auto cuda_sparse_out = sparse::SparseCooToCsr<T>(dev_ctx_gpu, d_coo);
CheckCsrResult<T, int64_t>(&dev_ctx_gpu,
......@@ -593,7 +593,7 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x,
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x);
phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x);
auto sparse_out = sparse::DenseToSparseCsr<T>(dev_ctx_gpu, d_dense_x);
CheckCsrResult<T, int64_t>(&dev_ctx_gpu,
......@@ -720,8 +720,10 @@ void TestSparseCooToDense(const DDim& dense_dims,
dev_ctx_gpu.PartialInitWithAllocator();
DenseTensor d_dense_indices(cuda_alloc.get(), dense_indices.meta());
DenseTensor d_dense_elements(cuda_alloc.get(), dense_elements.meta());
phi::Copy(dev_ctx_gpu, dense_indices, true, &d_dense_indices);
phi::Copy(dev_ctx_gpu, dense_elements, true, &d_dense_elements);
phi::Copy(
dev_ctx_gpu, dense_indices, phi::GPUPlace(), true, &d_dense_indices);
phi::Copy(
dev_ctx_gpu, dense_elements, phi::GPUPlace(), true, &d_dense_elements);
SparseCooTensor coo_cuda(d_dense_indices, d_dense_elements, dense_dims);
auto dense_out_cuda = sparse::SparseCooToDense<T>(dev_ctx_gpu, coo_cuda);
......@@ -729,7 +731,8 @@ void TestSparseCooToDense(const DDim& dense_dims,
DenseTensorMeta(dense_out_cuda.dtype(),
dense_out_cuda.dims(),
dense_out_cuda.layout()));
phi::Copy(dev_ctx_gpu, dense_out_cuda, true, &h_dense_out);
phi::Copy(
dev_ctx_gpu, dense_out_cuda, h_dense_out.place(), true, &h_dense_out);
int cmp_cuda = memcmp(
&dense_data[0], h_dense_out.data<T>(), sizeof(T) * dense_data.size());
ASSERT_EQ(cmp_cuda, 0);
......@@ -858,13 +861,13 @@ void TestSparseCsrToDense(const DDim& dense_dims,
phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
phi::DenseTensor d_values(cuda_alloc.get(), values_meta);
phi::Copy(dev_ctx_gpu, crows, true, &d_crows);
phi::Copy(dev_ctx_gpu, cols, true, &d_cols);
phi::Copy(dev_ctx_gpu, values, true, &d_values);
phi::Copy(dev_ctx_gpu, crows, phi::GPUPlace(), true, &d_crows);
phi::Copy(dev_ctx_gpu, cols, phi::GPUPlace(), true, &d_cols);
phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values);
phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
auto cuda_sparse_out = sparse::SparseCsrToDense<T>(dev_ctx_gpu, d_csr);
phi::DenseTensor h_out(alloc.get(), cpu_sparse_out.meta());
phi::Copy(dev_ctx_gpu, cuda_sparse_out, true, &h_out);
phi::Copy(dev_ctx_gpu, cuda_sparse_out, phi::CPUPlace(), true, &h_out);
int cmp_cuda =
memcmp(h_out.data<T>(), dense_data.data(), sizeof(T) * dense_data.size());
ASSERT_EQ(cmp_cuda, 0);
......