From 9a7b9eda579ec673204af5ba350d0d5b5104c28c Mon Sep 17 00:00:00 2001 From: zyfncg Date: Sat, 26 Feb 2022 20:09:38 +0800 Subject: [PATCH] [Pten] Refactor the copy kernel (#39731) * remove SetAllocationForOutputTenosr * add place param for copy kernel * recover SetAllocationForOutputTenosr * polish code * fix empty_dev api bug * test=allcases * test=allcases * fix bug * recover empty * recover modify --- paddle/phi/api/lib/api_custom_impl.cc | 40 +++++----- paddle/phi/core/kernel_utils.h | 1 + paddle/phi/kernels/copy_kernel.h | 1 + paddle/phi/kernels/cpu/copy_kernel.cc | 1 + paddle/phi/kernels/flatten_grad_kernel.cc | 4 +- paddle/phi/kernels/flatten_kernel.cc | 4 +- paddle/phi/kernels/gpu/copy_kernel.cu | 75 +++---------------- paddle/phi/kernels/gpu/elementwise.h | 10 +-- paddle/phi/kernels/gpu/full_kernel.cu | 4 +- .../impl/elementwise_grad_kernel_impl.h | 4 +- .../kernels/impl/expand_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/size_kernel_impl.h | 2 +- paddle/phi/kernels/reshape_grad_kernel.cc | 2 +- paddle/phi/kernels/reshape_kernel.cc | 2 +- paddle/phi/kernels/xpu/copy_kernel.cc | 13 +++- paddle/phi/tests/api/test_matmul_api.cc | 6 +- paddle/phi/tests/kernels/test_copy_dev_api.cc | 3 +- .../tests/kernels/test_creation_dev_api.cc | 2 +- .../kernels/test_sparse_utils_dev_api.cc | 41 +++++----- 19 files changed, 86 insertions(+), 131 deletions(-) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 66dba2cc2e..67b7430167 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -32,39 +32,33 @@ namespace paddle { namespace experimental { Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) { - // 1. Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "copy", kernel_key); - VLOG(0) << "to API kernel key: " << kernel_key; - VLOG(0) << "to API kernel: " << kernel; + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; - // 2. Get Device Context auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = phi::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x.get()); - kernel_context.EmplaceBackAttr(blocking); - - // 4. Prepare outputs & InferMeta - auto dense_out = std::make_shared( - phi::make_intrusive( - phi::TransToPtenPlace(backend)), - phi::DenseTensorMeta()); - phi::MetaTensor meta_out(dense_out.get()); - phi::UnchangedInferMeta(*dense_x, &meta_out); - dense_out->mutable_data(phi::TransToPtenPlace(backend)); - kernel_context.EmplaceBackOutput(dense_out.get()); + + auto dense_x = TensorToDenseTensor(x); + Tensor out; - out.set_impl(dense_out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(*dense_x, &meta_out); - // 5. Call kernel - kernel(&kernel_context); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + phi::Place, + bool, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)( + *dev_ctx, *dense_x, phi::TransToPtenPlace(backend), blocking, kernel_out); return out; } diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 862f61b204..2fda3cb6db 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -245,6 +245,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); diff --git a/paddle/phi/kernels/copy_kernel.h b/paddle/phi/kernels/copy_kernel.h index a3ba6eabcd..95df29f7e6 100644 --- a/paddle/phi/kernels/copy_kernel.h +++ b/paddle/phi/kernels/copy_kernel.h @@ -22,6 +22,7 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst); } // namespace phi diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 7dcd75d39e..1af071f23d 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -28,6 +28,7 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 7e8010a43f..f6ba272500 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -26,8 +26,8 @@ void FlattenGradKernel(const Context& dev_ctx, DenseTensor* x_grad) { auto xshape_dims = xshape.dims(); auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - phi::Copy(dev_ctx, out_grad, false, x_grad); - x_grad->ResizeAndAllocate(x_dims); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + x_grad->Resize(x_dims); } } // namespace phi diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index 12eaab92d5..78ac9eaa78 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -28,8 +28,8 @@ void FlattenKernel(const Context& dev_ctx, int stop_axis, DenseTensor* out) { auto out_dims = out->dims(); - phi::Copy(dev_ctx, x, false, out); - out->ResizeAndAllocate(out_dims); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + out->Resize(out_dims); } // TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu index 0cbf5525d6..4545f9ce43 100644 --- a/paddle/phi/kernels/gpu/copy_kernel.cu +++ b/paddle/phi/kernels/gpu/copy_kernel.cu @@ -28,11 +28,11 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); const auto& src_place = src.place(); - auto dst_place = dst->place(); if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) { PADDLE_THROW(phi::errors::InvalidArgument( @@ -43,8 +43,14 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->ResizeAndAllocate(src.dims()); - auto* dst_ptr = dst->mutable_data(dst_place); + dst->Resize(src.dims()); + + void* dst_ptr = nullptr; + if (paddle::platform::is_cpu_place(dst_place)) { + dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); + } else { + dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + } if (src_ptr == dst_ptr && src_place == dst_place) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " @@ -57,17 +63,8 @@ void Copy(const Context& dev_ctx, auto size = src.numel() * paddle::experimental::SizeOf(src.dtype()); - if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT - paddle::platform::is_cuda_pinned_place(dst_place)) { - paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT - paddle::platform::is_cpu_place(dst_place)) { - paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT - paddle::platform::is_cuda_pinned_place(dst_place)) { - paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT - paddle::platform::is_cpu_place(dst_place)) { + if (paddle::platform::is_gpu_place(src_place) && // NOLINT + paddle::platform::is_cpu_place(dst_place)) { auto src_gpu_place = src_place; auto dst_cpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); @@ -114,56 +111,6 @@ void Copy(const Context& dev_ctx, : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); - } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT - paddle::platform::is_cuda_pinned_place(dst_place)) { - auto src_gpu_place = src_place; - auto dst_cuda_pinned_place = dst_place; - auto ctx_place = dev_ctx.GetPlace(); - PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), - true, - phi::errors::PreconditionNotMet( - "Device context place mismatch. When copying Tensor " - "data from GPU memory to CUDA Pinned memory, current " - "device context place should be GPU.")); - auto ctx_gpu_place = ctx_place; - PADDLE_ENFORCE_EQ(src_gpu_place, - ctx_gpu_place, - phi::errors::PreconditionNotMet( - "The source GPU device and current device context do " - "not match. The source GPU device number is %d, but " - "device context GPU number is %d.", - src_gpu_place.device, - ctx_gpu_place.device)); - auto stream = - blocking ? nullptr - : reinterpret_cast(dev_ctx).stream(); - paddle::memory::Copy( - dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); - } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT - paddle::platform::is_gpu_place(dst_place)) { - auto src_cuda_pinned_place = src_place; - auto dst_gpu_place = dst_place; - auto ctx_place = dev_ctx.GetPlace(); - PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), - true, - phi::errors::PreconditionNotMet( - "Device context place mismatch. When copying Tensor " - "data from CUDA Pinned memory to GPU memory, current " - "device context place should be GPU.")); - auto ctx_gpu_place = ctx_place; - PADDLE_ENFORCE_EQ(dst_gpu_place, - ctx_gpu_place, - phi::errors::PreconditionNotMet( - "The target GPU device and current device context do " - "not match. The target GPU device number is %d, but " - "device context GPU number is %d.", - dst_gpu_place.device, - ctx_gpu_place.device)); - auto stream = - blocking ? nullptr - : reinterpret_cast(dev_ctx).stream(); - paddle::memory::Copy( - dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT paddle::platform::is_gpu_place(dst_place)) { auto src_gpu_place = src_place; diff --git a/paddle/phi/kernels/gpu/elementwise.h b/paddle/phi/kernels/gpu/elementwise.h index 369bd8d8ad..12cafc7023 100644 --- a/paddle/phi/kernels/gpu/elementwise.h +++ b/paddle/phi/kernels/gpu/elementwise.h @@ -1460,7 +1460,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, auto *dx_data = dx->mutable_data(ctx.GetPlace()); if (dx->dims() == dout.dims()) { if (dx_data != dout_data) { - phi::Copy(ctx, dout, false, dx); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); } } else { // For inplace strategy, dx will be stored in addr of dout, which makes @@ -1481,7 +1481,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, auto *dy_data = dy->mutable_data(ctx.GetPlace()); if (dy->dims() == dout.dims()) { if (dy_data != dout_data) { - phi::Copy(ctx, dout, false, dy); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); } } else { std::vector reduce_dims = @@ -1507,11 +1507,11 @@ void elementwise_add_grad(const GPUContext &ctx, if (dx_data == dout_data && dy_data != dout_data) { VLOG(4) << "Special case when dx_data is the same as dout_data, " "only need copy dout to dy"; - phi::Copy(ctx, dout, false, dy); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); } else if (dx_data != dout_data && dy_data == dout_data) { VLOG(4) << "Special case when dy_data is the same as dout_data, " "only need copy dout to dx"; - phi::Copy(ctx, dout, false, dx); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); } else if (dx_data != dout_data && dy_data != dout_data) { auto size = x.numel(); int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); @@ -1571,7 +1571,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, auto *dx_data = dx->mutable_data(ctx.GetPlace()); if (dx->dims() == dout.dims()) { if (dx_data != dout_data) { - phi::Copy(ctx, dout, false, dx); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); } } else { // For inplace strategy, dx will be stored in addr of dout, which makes diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index 48b2654033..1f756bfdbe 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -41,7 +41,7 @@ void FullKernel(const Context& dev_ctx, DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); int numel = out->numel(); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); if (numel > 0) { // in transformer model the numel of outpout will be zero. std::vector inputs = {}; @@ -85,7 +85,7 @@ void FullLikeKernel(const Context& dev_ctx, static_cast(value))); std::vector inputs = {}; std::vector outputs = {out}; - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); // This function has no input, so the inputs.size() == 0. Use kUnary, but the // data will not be loaded in the kernel because the number of parameters in // the operator is 0 diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index cafcb302d6..460e74b581 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -36,12 +36,12 @@ void AddGradImpl(const Context& dev_ctx, x_grad->dims() == out_grad.dims()) { VLOG(4) << "Special case when y_grad is not needed and x_grad doesn't " "reduce"; - phi::Copy(dev_ctx, out_grad, false, x_grad); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); } else if (x_grad == nullptr && y_grad != nullptr && y_grad->dims() == out_grad.dims()) { VLOG(4) << "Special case when x_grad is not needed and y_grad doesn't " "reduce"; - phi::Copy(dev_ctx, out_grad, false, y_grad); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, y_grad); } else { grad_func(dev_ctx, x, y, *out, out_grad, x_grad, y_grad, axis); } diff --git a/paddle/phi/kernels/impl/expand_grad_kernel_impl.h b/paddle/phi/kernels/impl/expand_grad_kernel_impl.h index 889b560dd7..766f91cd22 100644 --- a/paddle/phi/kernels/impl/expand_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/expand_grad_kernel_impl.h @@ -88,7 +88,7 @@ void ExpandGradKernel(const Context& ctx, } // no need reduce, just copy if (just_copy) { - phi::Copy(ctx, out_grad, false, in_grad); + phi::Copy(ctx, out_grad, ctx.GetPlace(), false, in_grad); } else { PADDLE_ENFORCE_GE(dims, 1, diff --git a/paddle/phi/kernels/impl/size_kernel_impl.h b/paddle/phi/kernels/impl/size_kernel_impl.h index 9a873871d7..7b781dba3a 100644 --- a/paddle/phi/kernels/impl/size_kernel_impl.h +++ b/paddle/phi/kernels/impl/size_kernel_impl.h @@ -32,7 +32,7 @@ void SizeKernel(const Context& ctx, cpu_tensor.Resize(out->dims()); auto cpu_data = ctx.template HostAlloc(&cpu_tensor); cpu_data[0] = input.numel(); - phi::Copy(ctx, cpu_tensor, false, out); + phi::Copy(ctx, cpu_tensor, place, false, out); } } diff --git a/paddle/phi/kernels/reshape_grad_kernel.cc b/paddle/phi/kernels/reshape_grad_kernel.cc index 5361315bb6..3813296640 100644 --- a/paddle/phi/kernels/reshape_grad_kernel.cc +++ b/paddle/phi/kernels/reshape_grad_kernel.cc @@ -24,7 +24,7 @@ void ReshapeGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, DenseTensor* x_grad) { auto x_dims = x_grad->dims(); - phi::Copy(dev_ctx, out_grad, false, x_grad); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); x_grad->Resize(x_dims); } diff --git a/paddle/phi/kernels/reshape_kernel.cc b/paddle/phi/kernels/reshape_kernel.cc index 570e70ea11..f758d7c705 100644 --- a/paddle/phi/kernels/reshape_kernel.cc +++ b/paddle/phi/kernels/reshape_kernel.cc @@ -36,7 +36,7 @@ void ReshapeKernel(const Context& dev_ctx, // TODO(chenweihang): the output dims are overwrite after copying, // here we need to use copy method that only copy data auto dims = out->dims(); - phi::Copy(dev_ctx, x, false, out); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); out->Resize(dims); out->ResetLoD(x.lod()); } diff --git a/paddle/phi/kernels/xpu/copy_kernel.cc b/paddle/phi/kernels/xpu/copy_kernel.cc index 58efbafc88..fb931ef18a 100644 --- a/paddle/phi/kernels/xpu/copy_kernel.cc +++ b/paddle/phi/kernels/xpu/copy_kernel.cc @@ -27,12 +27,19 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); - auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + void* dst_ptr = nullptr; + + dst->Resize(src.dims()); + if (paddle::platform::is_cpu_place(dst_place)) { + dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); + } else { + dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + } const auto& src_place = src.place(); - const auto& dst_place = dst->place(); if (src_ptr == dst_ptr && src_place == dst_place) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " @@ -43,7 +50,7 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->ResizeAndAllocate(src.dims()); + CHECK(dst->layout() == src.layout()); auto size = src.numel() * paddle::experimental::SizeOf(src.dtype()); diff --git a/paddle/phi/tests/api/test_matmul_api.cc b/paddle/phi/tests/api/test_matmul_api.cc index fd8a127b7c..e5fc9c5b1f 100644 --- a/paddle/phi/tests/api/test_matmul_api.cc +++ b/paddle/phi/tests/api/test_matmul_api.cc @@ -127,8 +127,8 @@ TEST(API, matmul_cuda) { auto place = paddle::platform::CUDAPlace(); auto* dev_ctx = static_cast(pool.GetByPlace(place)); - phi::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get()); - phi::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get()); + phi::Copy(*dev_ctx, *ref_x.get(), phi::GPUPlace(), false, dense_x.get()); + phi::Copy(*dev_ctx, *ref_y.get(), phi::GPUPlace(), false, dense_y.get()); paddle::experimental::Tensor x(dense_x); paddle::experimental::Tensor y(dense_y); @@ -152,7 +152,7 @@ TEST(API, matmul_cuda) { phi::DenseTensorMeta( phi::DataType::FLOAT32, out.dims(), phi::DataLayout::NCHW)); - phi::Copy(*dev_ctx, *dense_out.get(), false, ref_out.get()); + phi::Copy(*dev_ctx, *dense_out.get(), phi::CPUPlace(), false, ref_out.get()); for (size_t i = 0; i < 9; i++) { ASSERT_NEAR(sum[i], ref_out->data()[i], 1e-6f); diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc index 4cd283d925..d69c7b2174 100644 --- a/paddle/phi/tests/kernels/test_copy_dev_api.cc +++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc @@ -62,7 +62,8 @@ TEST(DEV_API, copy) { .GetAllocator(paddle::platform::CPUPlace()) .get()); dev_ctx.Init(); - phi::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get()); + phi::Copy( + dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get()); // 3. check result for (int64_t i = 0; i < dense_src->numel(); i++) { diff --git a/paddle/phi/tests/kernels/test_creation_dev_api.cc b/paddle/phi/tests/kernels/test_creation_dev_api.cc index 8e825b7790..e4f80a5bd1 100644 --- a/paddle/phi/tests/kernels/test_creation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_creation_dev_api.cc @@ -39,7 +39,7 @@ TEST(DEV_API, empty) { dev_ctx.Init(); // 2. test API - auto out = phi::Empty(dev_ctx, {3, 2}, phi::DataType::INT32); + auto out = phi::Empty(dev_ctx, {3, 2}, phi::DataType::INT32); // 3. check result ASSERT_EQ(out.dims().size(), 2); diff --git a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc index a75ca633b0..15c00d385e 100644 --- a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc @@ -53,8 +53,8 @@ inline void CheckResult( DenseTensorMeta(real_elements.dtype(), real_elements.dims(), real_elements.layout())); - phi::Copy(*dev_ctx_gpu, real_indices, true, &indices); - phi::Copy(*dev_ctx_gpu, real_elements, true, &elements); + phi::Copy(*dev_ctx_gpu, real_indices, indices.place(), true, &indices); + phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements); int cmp_indices = memcmp(indices.data(), non_zero_indices.data(), @@ -122,7 +122,7 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, cuda_alloc.get(), DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout())); - phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x); + phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x); auto sparse_out = sparse::DenseToSparseCoo(dev_ctx_gpu, d_dense_x, sparse_dim); CheckResult(&dev_ctx_gpu, @@ -327,9 +327,9 @@ void TestSparseCsrToCoo(const DDim& dense_dims, phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta); phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta); phi::DenseTensor d_values(cuda_alloc.get(), values_meta); - phi::Copy(dev_ctx_gpu, crows, true, &d_crows); - phi::Copy(dev_ctx_gpu, cols, true, &d_cols); - phi::Copy(dev_ctx_gpu, values, true, &d_values); + phi::Copy(dev_ctx_gpu, crows, d_crows.place(), true, &d_crows); + phi::Copy(dev_ctx_gpu, cols, d_cols.place(), true, &d_cols); + phi::Copy(dev_ctx_gpu, values, d_values.place(), true, &d_values); phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims); auto cuda_sparse_out = sparse::SparseCsrToCoo(dev_ctx_gpu, d_csr); CheckResult(&dev_ctx_gpu, @@ -406,9 +406,9 @@ inline void CheckCsrResult( DenseTensorMeta(real_elements.dtype(), real_elements.dims(), real_elements.layout())); - phi::Copy(*dev_ctx_gpu, real_crows, true, &crows); - phi::Copy(*dev_ctx_gpu, real_cols, true, &cols); - phi::Copy(*dev_ctx_gpu, real_elements, true, &elements); + phi::Copy(*dev_ctx_gpu, real_crows, crows.place(), true, &crows); + phi::Copy(*dev_ctx_gpu, real_cols, cols.place(), true, &cols); + phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements); int cmp_crows = memcmp(crows.data(), non_zero_crows.data(), @@ -500,8 +500,8 @@ void TestCooToCsr(const DDim& dense_dims, dev_ctx_gpu.PartialInitWithAllocator(); phi::DenseTensor d_indices(cuda_alloc.get(), indices_meta); phi::DenseTensor d_values(cuda_alloc.get(), values_meta); - phi::Copy(dev_ctx_gpu, indices, true, &d_indices); - phi::Copy(dev_ctx_gpu, values, true, &d_values); + phi::Copy(dev_ctx_gpu, indices, phi::GPUPlace(), true, &d_indices); + phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values); phi::SparseCooTensor d_coo(d_indices, d_values, dense_dims); auto cuda_sparse_out = sparse::SparseCooToCsr(dev_ctx_gpu, d_coo); CheckCsrResult(&dev_ctx_gpu, @@ -593,7 +593,7 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x, .GetAllocator(phi::CPUPlace()) .get()); dev_ctx_gpu.PartialInitWithAllocator(); - phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x); + phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x); auto sparse_out = sparse::DenseToSparseCsr(dev_ctx_gpu, d_dense_x); CheckCsrResult(&dev_ctx_gpu, @@ -720,8 +720,10 @@ void TestSparseCooToDense(const DDim& dense_dims, dev_ctx_gpu.PartialInitWithAllocator(); DenseTensor d_dense_indices(cuda_alloc.get(), dense_indices.meta()); DenseTensor d_dense_elements(cuda_alloc.get(), dense_elements.meta()); - phi::Copy(dev_ctx_gpu, dense_indices, true, &d_dense_indices); - phi::Copy(dev_ctx_gpu, dense_elements, true, &d_dense_elements); + phi::Copy( + dev_ctx_gpu, dense_indices, phi::GPUPlace(), true, &d_dense_indices); + phi::Copy( + dev_ctx_gpu, dense_elements, phi::GPUPlace(), true, &d_dense_elements); SparseCooTensor coo_cuda(d_dense_indices, d_dense_elements, dense_dims); auto dense_out_cuda = sparse::SparseCooToDense(dev_ctx_gpu, coo_cuda); @@ -729,7 +731,8 @@ void TestSparseCooToDense(const DDim& dense_dims, DenseTensorMeta(dense_out_cuda.dtype(), dense_out_cuda.dims(), dense_out_cuda.layout())); - phi::Copy(dev_ctx_gpu, dense_out_cuda, true, &h_dense_out); + phi::Copy( + dev_ctx_gpu, dense_out_cuda, h_dense_out.place(), true, &h_dense_out); int cmp_cuda = memcmp( &dense_data[0], h_dense_out.data(), sizeof(T) * dense_data.size()); ASSERT_EQ(cmp_cuda, 0); @@ -858,13 +861,13 @@ void TestSparseCsrToDense(const DDim& dense_dims, phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta); phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta); phi::DenseTensor d_values(cuda_alloc.get(), values_meta); - phi::Copy(dev_ctx_gpu, crows, true, &d_crows); - phi::Copy(dev_ctx_gpu, cols, true, &d_cols); - phi::Copy(dev_ctx_gpu, values, true, &d_values); + phi::Copy(dev_ctx_gpu, crows, phi::GPUPlace(), true, &d_crows); + phi::Copy(dev_ctx_gpu, cols, phi::GPUPlace(), true, &d_cols); + phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values); phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims); auto cuda_sparse_out = sparse::SparseCsrToDense(dev_ctx_gpu, d_csr); phi::DenseTensor h_out(alloc.get(), cpu_sparse_out.meta()); - phi::Copy(dev_ctx_gpu, cuda_sparse_out, true, &h_out); + phi::Copy(dev_ctx_gpu, cuda_sparse_out, phi::CPUPlace(), true, &h_out); int cmp_cuda = memcmp(h_out.data(), dense_data.data(), sizeof(T) * dense_data.size()); ASSERT_EQ(cmp_cuda, 0); -- GitLab