From 822a2d1f37bb916c5eda1f3deb4f97b86c352a1b Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 28 Mar 2022 14:05:21 +0800
Subject: [PATCH] [Phi] Fix assign kernel bug (#40927)

* fix assign kernel bug

* fix xpu kernel select error

* add cuda pinned place

* fix copy error

* fix infrt error
---
 cmake/operators.cmake                      |  6 ++++++
 paddle/phi/kernels/assign_kernel.cc        | 18 ++++++++++++------
 paddle/phi/kernels/gpu/copy_kernel.cu      | 20 ++++++++++++--------
 tools/infrt/generate_phi_kernel_dialect.py |  2 +-
 4 files changed, 31 insertions(+), 15 deletions(-)
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 1291e60cfe..1df9e14973 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -394,6 +394,12 @@ function(op_library TARGET)
         if(NOT ${op_name} EQUAL "")
             file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n")
             set(pybind_flag 1)
+        else()
+            find_register(${xpu_src} "REGISTER_OP_XPU_KERNEL_FUNCTOR" op_name)
+            if(NOT ${op_name} EQUAL "")
+                file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n")
+                set(pybind_flag 1)
+            endif()
         endif()
         endforeach()
     endif()
diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc
index e5379e97c4..bd86d33ed3 100644
--- a/paddle/phi/kernels/assign_kernel.cc
+++ b/paddle/phi/kernels/assign_kernel.cc
@@ -26,11 +26,13 @@ template <typename Context>
 void AssignKernel(const Context& dev_ctx,
                   paddle::optional<const DenseTensor&> x,
                   DenseTensor* out) {
-  if (!x.is_initialized()) {
-    return;
+  if (x.get_ptr()) {
+    if (!x.is_initialized()) {
+      return;
+    }
+    auto& x_tensor = *x.get_ptr();
+    Copy<Context>(dev_ctx, x_tensor, x_tensor.place(), false, out);
   }
-  auto& x_tensor = *x.get_ptr();
-  Copy<Context>(dev_ctx, x_tensor, x_tensor.place(), false, out);
 }
 
 // Note: use `const paddle::optional<std::vector<const DenseTensor*>&> x`
@@ -103,7 +105,9 @@ void AssignValueKernel(const Context& dev_ctx,
 }  // namespace phi
 
 PD_REGISTER_GENERAL_KERNEL(
-    assign, CPU, ALL_LAYOUT, phi::AssignKernel<phi::CPUContext>, ALL_DTYPE) {}
+    assign, CPU, ALL_LAYOUT, phi::AssignKernel<phi::CPUContext>, ALL_DTYPE) {
+  kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
+}
 PD_REGISTER_GENERAL_KERNEL(assign_array,
                            CPU,
                            ALL_LAYOUT,
@@ -120,7 +124,9 @@ PD_REGISTER_KERNEL(assign_value,
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_REGISTER_GENERAL_KERNEL(
-    assign, GPU, ALL_LAYOUT, phi::AssignKernel<phi::GPUContext>, ALL_DTYPE) {}
+    assign, GPU, ALL_LAYOUT, phi::AssignKernel<phi::GPUContext>, ALL_DTYPE) {
+  kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
+}
 PD_REGISTER_GENERAL_KERNEL(assign_array,
                            GPU,
                            ALL_LAYOUT,
diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu
index a16c8369cc..28dc6f196d 100644
--- a/paddle/phi/kernels/gpu/copy_kernel.cu
+++ b/paddle/phi/kernels/gpu/copy_kernel.cu
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/phi/core/kernel_registry.h"
 
 // See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
 
@@ -34,12 +35,6 @@ void Copy(const Context& dev_ctx,
   auto* src_ptr = src.data();
   const auto& src_place = src.place();
 
-  if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) {
-    PADDLE_THROW(phi::errors::InvalidArgument(
-        "The src and dst tensor are all CPU tensor, you should call copy "
-        "function in CPU mode."));
-  }
-
   VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
           << dst_place;
 
@@ -48,6 +43,10 @@ void Copy(const Context& dev_ctx,
   void* dst_ptr = nullptr;
   if (paddle::platform::is_cpu_place(dst_place)) {
     dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
+  } else if (paddle::platform::is_cuda_pinned_place(dst_place)) {
+    // For now, pinned memory can only be allocated via mutable_data here;
+    // dev_ctx cannot allocate pinned memory yet.
+    dst_ptr = dst->mutable_data(dst_place, src.dtype());
   } else {
     dst_ptr = dev_ctx.Alloc(dst, src.dtype());
   }
@@ -63,8 +62,13 @@ void Copy(const Context& dev_ctx,
 
   auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
 
-  if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
-      paddle::platform::is_cpu_place(dst_place)) {
+  if ((paddle::platform::is_cpu_place(src_place) ||
+       paddle::platform::is_cuda_pinned_place(src_place)) &&  // NOLINT
+      (paddle::platform::is_cpu_place(dst_place) ||
+       paddle::platform::is_cuda_pinned_place(dst_place))) {
+    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
+  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_cpu_place(dst_place)) {
     auto src_gpu_place = src_place;
     auto dst_cpu_place = dst_place;
     auto ctx_place = dev_ctx.GetPlace();
diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py
index bfe1e7e88b..4ac8a2e127 100644
--- a/tools/infrt/generate_phi_kernel_dialect.py
+++ b/tools/infrt/generate_phi_kernel_dialect.py
@@ -27,7 +27,7 @@ attr_type_converter = {
     "St6vectorIiSaIiEE": 'I32ArrayAttr'
 }
 
-target_type_converter = {"CPU": "CPU", "GPU": "GPU"}
+target_type_converter = {"CPU": "CPU", "GPU": "GPU", "Undefined": "UNK"}
 layout_type_converter = {
     "NCHW": "NCHW",
     "NHWC": "NHWC",
-- 
GitLab