Unverified commit 73819658, authored by Chen Weihang, committed by GitHub

[PTen] Move copy kernel impl (#38421)

* add register general kernel macro

* move copy kernel impl

* revert needless change

* polish details

* fix xpu compile failed

* fix xpu compile failed

* polish format
Parent e5c7ca48
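What the hunks below boil down to: the backend-specific copy utilities (cpu/utils, gpu/utils, xpu/utils and their utils_* CMake targets) are folded into a single context-templated Copy kernel declared in paddle/pten/kernels/copy_kernel.h, and each backend registers its own instantiation through the new PT_REGISTER_GENERAL_KERNEL macro. A minimal sketch of the resulting pattern, condensed from the diff itself (kernel bodies and exact file layout elided):

    // One declaration for all backends (paddle/pten/kernels/copy_kernel.h)
    #pragma once

    #include "paddle/pten/core/dense_tensor.h"

    namespace pten {

    template <typename ContextT>
    void Copy(const ContextT& dev_ctx,
              const DenseTensor& src,
              bool blocking,
              DenseTensor* dst);

    }  // namespace pten

    // Each backend source then instantiates and registers the template,
    // e.g. for CPU (GPU and XPU follow the same pattern):
    PT_REGISTER_GENERAL_KERNEL(
        copy, CPU, ALL_LAYOUT, pten::Copy<pten::CPUContext>, ALL_DTYPE) {}

Call sites that used to include a backend-specific utils header now include only copy_kernel.h and invoke the same template with whatever device context they hold, e.g. pten::Copy(dev_ctx, x, false, &out) (a hypothetical call, not shown in this diff).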
@@ -275,7 +275,7 @@ if(WITH_PYTHON)
   if(NOT ON_INFER)
     cc_library(paddle_eager
       SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc
-      DEPS eager_api autograd_meta backward grad_node_info pten op_function_common dygraph_function dygraph_node math_cpu linalg_cpu utils_cpu manipulation_cpu accumulation_node global_utils utils python)
+      DEPS eager_api autograd_meta backward grad_node_info pten op_function_common dygraph_function dygraph_node accumulation_node global_utils utils python)
     add_dependencies(paddle_eager eager_codegen)
     add_dependencies(paddle_eager eager_op_function_generator_cmd)
     list(APPEND PYBIND_DEPS paddle_eager)
...
@@ -25,6 +25,8 @@ add_subdirectory(tests)
 # make an unity target for compile deps
 set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context)
 get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
+# keep this message for debug, remove it later if needless
+message(STATUS "All standard pten kernels: ${pten_kernels}")
 set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels})
 set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu manipulation_cpu)
 set(PTEN_DEPS ${PTEN_DEPS} nary unary binary)
...
@@ -33,15 +33,6 @@ set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function)
 # auto build kernel targets by cmake
 register_kernels(EXCLUDES flatten_kernel DEPS ${COMMON_KERNEL_DEPS})
 # TODO(chenweihang): auto parse compile deps by include headers later
-set(FLATTEN_DEPS ${COMMON_KERNEL_DEPS} utils_cpu unary)
-if(WITH_GPU OR WITH_ROCM)
-  set(FLATTEN_DEPS ${FLATTEN_DEPS} utils_gpu)
-elseif(WITH_XPU)
-  set(FLATTEN_DEPS ${FLATTEN_DEPS} utils_xpu)
-endif()
-kernel_library(flatten_kernel DEPS ${FLATTEN_DEPS})
-get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
-message(STATUS "PTEN_KERNELS: ${pten_kernels}")
+kernel_library(flatten_kernel DEPS ${COMMON_KERNEL_DEPS} copy_kernel unary)

 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
...
@@ -14,13 +14,12 @@ limitations under the License. */

 #pragma once

-#include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/kernel_registry.h"

 namespace pten {

-void Copy(const CPUContext& dev_ctx,
+template <typename ContextT>
+void Copy(const ContextT& dev_ctx,
           const DenseTensor& src,
           bool blocking,
           DenseTensor* dst);
...
 cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
-cc_library(utils_cpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
-cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_cpu unary)
+cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
@@ -12,15 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/pten/kernels/cpu/utils.h"
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/pten/kernels/copy_kernel.h"
+
+#include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/memcpy.h"

 namespace pten {

 // NOTE(chenweihang): blocking is useless in cpu kernel
-void Copy(const CPUContext& dev_ctx,
+template <typename ContextT>
+void Copy(const ContextT& dev_ctx,
           const DenseTensor& src,
           bool blocking,
           DenseTensor* dst) {
@@ -57,4 +63,5 @@ void Copy(const CPUContext& dev_ctx,

 }  // namespace pten

-PT_REGISTER_NO_TEMPLATE_KERNEL(copy, CPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {}
+PT_REGISTER_GENERAL_KERNEL(
+    copy, CPU, ALL_LAYOUT, pten::Copy<pten::CPUContext>, ALL_DTYPE) {}
@@ -15,7 +15,7 @@
 #include "paddle/pten/kernels/cpu/manipulation.h"
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/infermeta/unary.h"
-#include "paddle/pten/kernels/cpu/utils.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
 #include "paddle/pten/kernels/hybird/math/cast_func.h"
...
@@ -16,10 +16,8 @@
 #include "paddle/pten/backends/all_context.h"
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/infermeta/unary.h"
-#include "paddle/pten/kernels/cpu/utils.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/funcs/common_shape.h"
-#include "paddle/pten/kernels/gpu/utils.h"
-#include "paddle/pten/kernels/xpu/utils.h"

 namespace pten {
...
 if(WITH_GPU)
   nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
   nv_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
-  nv_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
-  nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary)
+  nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
 elseif(WITH_ROCM)
   hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
   hip_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
-  hip_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
-  hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary)
+  hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
 endif()
@@ -12,15 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/pten/kernels/copy_kernel.h"
+
+#include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/gpu/utils.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/memcpy.h"

 namespace pten {

-void Copy(const GPUContext& dev_ctx,
+template <typename ContextT>
+void Copy(const ContextT& dev_ctx,
           const DenseTensor& src,
           bool blocking,
           DenseTensor* dst) {
@@ -232,6 +237,8 @@ void Copy(const GPUContext& dev_ctx,
     }
   }
 }

 }  // namespace pten

-PT_REGISTER_NO_TEMPLATE_KERNEL(copy, GPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {}
+PT_REGISTER_GENERAL_KERNEL(
+    copy, GPU, ALL_LAYOUT, pten::Copy<pten::GPUContext>, ALL_DTYPE) {}
@@ -14,8 +14,8 @@

 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/infermeta/unary.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/gpu/manipulation.h"
-#include "paddle/pten/kernels/gpu/utils.h"
 #include "paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
@@ -85,6 +85,7 @@ PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
 PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
 #endif

-PT_REGISTER_NO_TEMPLATE_KERNEL(reshape, GPU, ANY, pten::Reshape, ALL_DTYPE) {}
 PT_REGISTER_NO_TEMPLATE_KERNEL(
-    reshape_with_xshape, GPU, ANY, pten::ReshapeWithXShape, ALL_DTYPE) {}
+    reshape, GPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
+PT_REGISTER_NO_TEMPLATE_KERNEL(
+    reshape_with_xshape, GPU, ALL_LAYOUT, pten::ReshapeWithXShape, ALL_DTYPE) {}
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// CUDA and HIP use same api
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-
-#include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/kernel_registry.h"
-
-namespace pten {
-
-void Copy(const GPUContext& dev_ctx,
-          const DenseTensor& src,
-          bool blocking,
-          DenseTensor* dst);
-
-}  // namespace pten
-
-#endif
@@ -41,7 +41,7 @@ namespace cub = hipcub;

 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/gpu/utils.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/hybird/math/cast_func.h"

 // Reduce split or not, Whether to use ReduceHigherDim
...
-cc_library(utils_xpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
-cc_library(manipulation_xpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_xpu unary)
+cc_library(manipulation_xpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
@@ -12,14 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/pten/kernels/xpu/utils.h"
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/pten/kernels/copy_kernel.h"
+
+#include "paddle/pten/backends/xpu/xpu_context.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/memcpy.h"

 namespace pten {

-void Copy(const XPUDeviceContext& dev_ctx,
+template <typename ContextT>
+void Copy(const ContextT& dev_ctx,
           const DenseTensor& src,
           bool blocking,
           DenseTensor* dst) {
@@ -76,4 +82,5 @@ void Copy(const XPUDeviceContext& dev_ctx,

 }  // namespace pten

-PT_REGISTER_NO_TEMPLATE_KERNEL(copy, XPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {}
+PT_REGISTER_GENERAL_KERNEL(
+    copy, XPU, ALL_LAYOUT, pten::Copy<pten::XPUContext>, ALL_DTYPE) {}
@@ -14,8 +14,8 @@

 #include "paddle/pten/kernels/xpu/manipulation.h"

 #include "paddle/pten/infermeta/unary.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
-#include "paddle/pten/kernels/xpu/utils.h"

 namespace pten {
...
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/kernel_registry.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/device_context.h"
-
-namespace pten {
-
-using XPUDeviceContext = paddle::platform::XPUDeviceContext;
-
-void Copy(const XPUDeviceContext& dev_ctx,
-          const DenseTensor& src,
-          bool blocking,
-          DenseTensor* dst);
-
-}  // namespace pten
-
-#endif
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/gpu/utils.h"
+#include "paddle/pten/kernels/copy_kernel.h"

 namespace paddle {
 namespace tests {
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <memory>

 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/cpu/utils.h"
+#include "paddle/pten/kernels/copy_kernel.h"

 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
@@ -28,8 +28,7 @@ namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;

 // TODO(YuanRisheng): This TEST file need to be refactored after 'copy' realized
-// in
-// 'paddle/api',
+// in 'paddle/api'
 TEST(DEV_API, copy) {
   // 1. create tensor
   const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
...