diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
index 94fab974aced173b4101f6b8cfc0812f011f92a7..1f1b8ddd5f412d494572a2e9cf154c9c93095b8d 100644
--- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
+++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
@@ -769,6 +769,23 @@ static void LaunchReduceKernel(const Tx* x_data,
   }
 }
 
+void TensorCopy(const DenseTensor& src, DenseTensor* dst) {
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  const paddle::platform::CUDADeviceContext* dev_ctx;
+  if (paddle::platform::is_gpu_place(dst->place()) ||
+      paddle::platform::is_npu_place(dst->place())) {
+    dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
+        pool.Get(dst->place()));
+
+  } else {
+    dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
+        pool.Get(src.place()));
+  }
+
+  pten::Copy(*dev_ctx, src, false, dst);
+}
+
 template <typename Tx,
           typename Ty,
           template <typename> class ReduceOp>
@@ -800,7 +817,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
   if (config.reduce_num == 1) {
     auto out_dims = y->dims();
     if (x.dtype() == y->dtype()) {
-      pten::Copy(*dev_ctx, x, true, y);
+      TensorCopy(x, y);
       y->Resize(out_dims);
     } else {
       PD_VISIT_ALL_TYPES(y->dtype(), "CastKernelImpl", ([&] {
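
For context on the second hunk: the reduce_num == 1 fast path previously issued a blocking copy (pten::Copy(*dev_ctx, x, true, y)), while the new TensorCopy helper resolves a CUDA device context from the destination place (falling back to the source place) and passes false, i.e. a non-blocking copy. The sketch below restates that pattern with explanatory comments; it is an illustration only, not part of the patch. The name AsyncCopySketch is hypothetical, the reading of the third pten::Copy argument as a blocking flag is an assumption, and the types are assumed to come from the headers that reduce_cuda_impl.h already includes (DeviceContextPool, the is_*_place predicates, pten::Copy).

// Illustration only (not part of the patch). AsyncCopySketch is a
// hypothetical name; the types used here are assumed to be available via the
// headers already included by reduce_cuda_impl.h.
void AsyncCopySketch(const pten::DenseTensor& src, pten::DenseTensor* dst) {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  // Prefer the destination's device context when the destination lives on an
  // accelerator (GPU/NPU); otherwise fall back to the source's context.
  const auto& place = (paddle::platform::is_gpu_place(dst->place()) ||
                       paddle::platform::is_npu_place(dst->place()))
                          ? dst->place()
                          : src.place();
  auto* dev_ctx =
      static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
  // The third argument appears to be a blocking flag: the old call site passed
  // true (synchronous), the helper passes false, so the copy is enqueued on
  // the context's stream and the caller relies on stream ordering before
  // reading dst on the host.
  pten::Copy(*dev_ctx, src, /*blocking=*/false, dst);
}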