From 822a2d1f37bb916c5eda1f3deb4f97b86c352a1b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 28 Mar 2022 14:05:21 +0800 Subject: [PATCH] [Phi] Fix assign kernel bug (#40927) * fix assign kernel bug * fix xpu kernel select error * add cuda pinned place * fix copy error * fix infrt error --- cmake/operators.cmake | 6 ++++++ paddle/phi/kernels/assign_kernel.cc | 18 ++++++++++++------ paddle/phi/kernels/gpu/copy_kernel.cu | 20 ++++++++++++-------- tools/infrt/generate_phi_kernel_dialect.py | 2 +- 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 1291e60cfe..1df9e14973 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -394,6 +394,12 @@ function(op_library TARGET) if(NOT ${op_name} EQUAL "") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n") set(pybind_flag 1) + else() + find_register(${xpu_src} "REGISTER_OP_XPU_KERNEL_FUNCTOR" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n") + set(pybind_flag 1) + endif() endif() endforeach() endif() diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index e5379e97c4..bd86d33ed3 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -26,11 +26,13 @@ template void AssignKernel(const Context& dev_ctx, paddle::optional x, DenseTensor* out) { - if (!x.is_initialized()) { - return; + if (x.get_ptr()) { + if (!x.is_initialized()) { + return; + } + auto& x_tensor = *x.get_ptr(); + Copy(dev_ctx, x_tensor, x_tensor.place(), false, out); } - auto& x_tensor = *x.get_ptr(); - Copy(dev_ctx, x_tensor, x_tensor.place(), false, out); } // Note: use `const paddle::optional&> x` @@ -103,7 +105,9 @@ void AssignValueKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_GENERAL_KERNEL( - assign, CPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} + assign, CPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) { +
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_GENERAL_KERNEL(assign_array, CPU, ALL_LAYOUT, @@ -120,7 +124,9 @@ PD_REGISTER_KERNEL(assign_value, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_GENERAL_KERNEL( - assign, GPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} + assign, GPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_GENERAL_KERNEL(assign_array, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu index a16c8369cc..28dc6f196d 100644 --- a/paddle/phi/kernels/gpu/copy_kernel.cu +++ b/paddle/phi/kernels/gpu/copy_kernel.cu @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" @@ -34,12 +35,6 @@ void Copy(const Context& dev_ctx, auto* src_ptr = src.data(); const auto& src_place = src.place(); - if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) { - PADDLE_THROW(phi::errors::InvalidArgument( - "The src and dst tensor are all CPU tensor, you should call copy " - "function in CPU mode.")); - } - VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; @@ -48,6 +43,10 @@ void Copy(const Context& dev_ctx, void* dst_ptr = nullptr; if (paddle::platform::is_cpu_place(dst_place)) { dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); + } else if (paddle::platform::is_cuda_pinned_place(dst_place)) { + // now we only can use mutable_data to Alloc pinned memory here, + // dev_ctx can not alloc pinned memory now + dst_ptr = dst->mutable_data(dst_place, src.dtype()); } else { dst_ptr = dev_ctx.Alloc(dst, src.dtype()); } @@ -63,8 +62,13 @@ void Copy(const Context& dev_ctx, auto size = src.numel() * 
paddle::experimental::SizeOf(src.dtype()); - if (paddle::platform::is_gpu_place(src_place) && // NOLINT - paddle::platform::is_cpu_place(dst_place)) { + if ((paddle::platform::is_cpu_place(src_place) || + paddle::platform::is_cuda_pinned_place(src_place)) && // NOLINT + (paddle::platform::is_cpu_place(dst_place) || + paddle::platform::is_cuda_pinned_place(dst_place))) { + paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT + paddle::platform::is_cpu_place(dst_place)) { auto src_gpu_place = src_place; auto dst_cpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index bfe1e7e88b..4ac8a2e127 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -27,7 +27,7 @@ attr_type_converter = { "St6vectorIiSaIiEE": 'I32ArrayAttr' } -target_type_converter = {"CPU": "CPU", "GPU": "GPU"} +target_type_converter = {"CPU": "CPU", "GPU": "GPU", "Undefined": "UNK"} layout_type_converter = { "NCHW": "NCHW", "NHWC": "NHWC", -- GitLab