diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
index 94fab974aced173b4101f6b8cfc0812f011f92a7..1f1b8ddd5f412d494572a2e9cf154c9c93095b8d 100644
--- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
+++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
@@ -769,6 +769,23 @@ static void LaunchReduceKernel(const Tx* x_data,
   }
 }
 
+void TensorCopy(const DenseTensor& src, DenseTensor* dst) {
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  const paddle::platform::CUDADeviceContext* dev_ctx;
+  if (paddle::platform::is_gpu_place(dst->place()) ||
+      paddle::platform::is_npu_place(dst->place())) {
+    dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
+        pool.Get(dst->place()));
+
+  } else {
+    dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
+        pool.Get(src.place()));
+  }
+
+  pten::Copy(*dev_ctx, src, false, dst);
+}
+
 template <typename Tx,
           typename Ty,
           template <typename> class ReduceOp>
@@ -800,7 +817,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
   if (config.reduce_num == 1) {
     auto out_dims = y->dims();
     if (x.dtype() == y->dtype()) {
-      pten::Copy(*dev_ctx, x, true, y);
+      TensorCopy(x, y);
       y->Resize(out_dims);
     } else {
       PD_VISIT_ALL_TYPES(y->dtype(), "CastKernelImpl", ([&] {
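
For context on the second hunk: the reduce_num == 1 fast path previously issued a blocking copy (pten::Copy(*dev_ctx, x, true, y)), while the new TensorCopy helper resolves a CUDA device context from the destination place (falling back to the source place) and passes false, i.e. a non-blocking copy. The sketch below restates that pattern with explanatory comments; it is an illustration only, not part of the patch. The name AsyncCopySketch is hypothetical, the reading of the third pten::Copy argument as a blocking flag is an assumption, and the types are assumed to come from the headers that reduce_cuda_impl.h already includes (DeviceContextPool, the is_*_place predicates, pten::Copy).

// Illustration only (not part of the patch). AsyncCopySketch is a
// hypothetical name; the types used here are assumed to be available via the
// headers already included by reduce_cuda_impl.h.
void AsyncCopySketch(const pten::DenseTensor& src, pten::DenseTensor* dst) {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  // Prefer the destination's device context when the destination lives on an
  // accelerator (GPU/NPU); otherwise fall back to the source's context.
  const auto& place = (paddle::platform::is_gpu_place(dst->place()) ||
                       paddle::platform::is_npu_place(dst->place()))
                          ? dst->place()
                          : src.place();
  auto* dev_ctx =
      static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
  // The third argument appears to be a blocking flag: the old call site passed
  // true (synchronous), the helper passes false, so the copy is enqueued on
  // the context's stream and the caller relies on stream ordering before
  // reading dst on the host.
  pten::Copy(*dev_ctx, src, /*blocking=*/false, dst);
}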