[PTen] Move cast kernel impl (#38382)

* rename `to` API to `copy_to` * revert needless change * polish format

[PTen] Move cast kernel impl (#38382)
* rename `to` API to `copy_to` * revert needless change * polish format
1fb734d7 · Chen Weihang · GitHub · 04527ee3 · 1fb734d7 · 1fb734d7
14 changed files
--- a/paddle/pten/api/lib/kernel_declare.h
+++ b/paddle/pten/api/lib/kernel_declare.h
@@ -21,12 +21,12 @@ limitations under the License. */
 // file name of the kernel, and this header file will be removed

 PT_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
-PT_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(reshape, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(mean, CPU, ALL_LAYOUT);

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PT_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
-PT_DECLARE_KERNEL(cast, GPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
 #endif


--- a/paddle/pten/include/manipulation.h
+++ b/paddle/pten/include/manipulation.h
@@ -17,6 +17,7 @@
 // See Note: [ How do we organize the kernel directory ]
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/include/infermeta.h"
+#include "paddle/pten/kernels/cast_kernel.h"
 #include "paddle/pten/kernels/cpu/manipulation.h"
 #include "paddle/pten/kernels/flatten_kernel.h"
 #include "paddle/pten/kernels/gpu/manipulation.h"
@@ -48,7 +49,7 @@ DenseTensor Cast(const ContextT& dev_ctx,
      pten::make_intrusive<paddle::experimental::SharedStorage>(
          dev_ctx.GetPlace()),
      std::move(out_meta));
-  Cast<T>(dev_ctx, x, out_dtype, in_dtype, &dense_out);
+  Cast<T, ContextT>(dev_ctx, x, out_dtype, in_dtype, &dense_out);
  return dense_out;
 }


--- a/paddle/pten/kernels/cast_kernel.h
+++ b/paddle/pten/kernels/cast_kernel.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+void Cast(const ContextT& dev_ctx,
+          const DenseTensor& x,
+          DataType out_dtype,
+          DataType in_dtype,
+          DenseTensor* out);
+
+}  // namespace pten
--- a/paddle/pten/kernels/cpu/CMakeLists.txt
+++ b/paddle/pten/kernels/cpu/CMakeLists.txt
-cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu)
+cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu cast_kernel)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
 cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
--- a/paddle/pten/kernels/hybird/math/cast_func.h
+++ b/paddle/pten/kernels/hybird/math/cast_func.h
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,21 +12,24 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#pragma once
+#include "paddle/pten/kernels/cast_kernel.h"

+#include "paddle/pten/api/ext/dispatch.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/transform.h"
-#include "paddle/pten/core/dense_tensor.h"

 namespace pten {
-namespace math {

 template <typename InT, typename OutT>
 struct CastOpTransformFunctor {
  HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
 };

-template <typename DeviceContext, typename InT, typename OutT>
-void CastKernelImpl(const DeviceContext& dev_ctx,
+template <typename InT, typename OutT>
+void CastKernelImpl(const CPUContext& dev_ctx,
                    const DenseTensor& x,
                    DenseTensor* out) {
  auto* in_begin = x.data<InT>();
@@ -35,7 +38,7 @@ void CastKernelImpl(const DeviceContext& dev_ctx,

  auto* out_begin = out->mutable_data<OutT>();

-  paddle::platform::Transform<DeviceContext> trans;
+  paddle::platform::Transform<CPUContext> trans;
  trans(dev_ctx,
        in_begin,
        in_end,
@@ -43,6 +46,33 @@ void CastKernelImpl(const DeviceContext& dev_ctx,
        CastOpTransformFunctor<InT, OutT>());
 }

-}  // namespace math
+template <typename T, typename ContextT>
+void Cast(const ContextT& dev_ctx,
+          const DenseTensor& x,
+          DataType out_dtype,
+          DataType in_dtype,
+          DenseTensor* out) {
+  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
+                       CastKernelImpl<T, data_t>(dev_ctx, x, out);
+                     }));
+}

 }  // namespace pten
+
+PT_REGISTER_CTX_KERNEL(cast,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::Cast,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       int16_t,
+                       bool,
+                       uint8_t,
+                       paddle::platform::float16,
+                       paddle::platform::bfloat16,
+                       paddle::platform::complex<float>,
+                       paddle::platform::complex<double>) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -17,7 +17,6 @@
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"

 namespace pten {

@@ -44,38 +43,8 @@ void ReshapeWithXShape(const CPUContext& dev_ctx,
  Reshape(dev_ctx, x, shape, out);
 }

-template <typename T>
-void Cast(const CPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out) {
-  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
-                       math::CastKernelImpl<CPUContext, T, data_t>(
-                           dev_ctx, x, out);
-                     }));
-}
-
 }  // namespace pten

-PT_REGISTER_KERNEL(cast,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Cast,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   int16_t,
-                   bool,
-                   uint8_t,
-                   paddle::platform::float16,
-                   paddle::platform::bfloat16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
-
 PT_REGISTER_NO_TEMPLATE_KERNEL(
    reshape, CPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
 PT_REGISTER_NO_TEMPLATE_KERNEL(

--- a/paddle/pten/kernels/cpu/manipulation.h
+++ b/paddle/pten/kernels/cpu/manipulation.h
@@ -21,13 +21,6 @@ limitations under the License. */

 namespace pten {

-template <typename T>
-void Cast(const CPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out);
-
 void Reshape(const CPUContext& dev_ctx,
             const DenseTensor& x,
             const ScalarArray& shape,

--- a/paddle/pten/kernels/gpu/CMakeLists.txt
+++ b/paddle/pten/kernels/gpu/CMakeLists.txt
 if(WITH_GPU)
-  nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
+  nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel)
  nv_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
  nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
 elseif(WITH_ROCM)
-  hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
+  hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel)
  hip_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
  hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
 endif()
--- a/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h
+++ b/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h
@@ -13,15 +13,21 @@
 // limitations under the License.

 #pragma once
-#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
-#include "paddle/fluid/platform/float16.h"
+
+#include "paddle/pten/kernels/cast_kernel.h"
+
+#include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"

+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/aligned_vector.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/fluid/platform/float16.h"
+
 namespace pten {
-namespace detail {

 template <typename InT, typename OutT, int VecSize>
 __global__ void VecCastCUDAKernel(const InT* in, const int64_t N, OutT* out) {
@@ -74,6 +80,41 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx,
  }
 }

-}  // namespace detail
+template <typename T, typename ContextT>
+void Cast(const ContextT& dev_ctx,
+          const DenseTensor& x,
+          DataType out_dtype,
+          DataType in_dtype,
+          DenseTensor* out) {
+  PD_VISIT_ALL_TYPES(out_dtype, "CastCUDAKernelImpl", ([&] {
+                       CastCUDAKernelImpl<T, data_t>(dev_ctx, x, out);
+                     }));
+}

 }  // namespace pten
+
+#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...)     \
+  PT_REGISTER_CTX_KERNEL(cast,                              \
+                         GPU,                               \
+                         ALL_LAYOUT,                        \
+                         pten::Cast,                        \
+                         float,                             \
+                         double,                            \
+                         int,                               \
+                         int64_t,                           \
+                         int16_t,                           \
+                         bool,                              \
+                         uint8_t,                           \
+                         paddle::platform::float16,         \
+                         paddle::platform::complex<float>,  \
+                         paddle::platform::complex<double>, \
+                         ##__VA_ARGS__) {                   \
+    kernel->OutputAt(0).SetDataType(                        \
+        paddle::experimental::DataType::UNDEFINED);         \
+  }
+
+#if !defined(PADDLE_WITH_HIP)
+PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
+#else
+PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
+#endif
--- a/paddle/pten/kernels/gpu/manipulation.cu
+++ b/paddle/pten/kernels/gpu/manipulation.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/pten/api/ext/dispatch.h"
+#include "paddle/pten/kernels/gpu/manipulation.h"
+
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/copy_kernel.h"
-#include "paddle/pten/kernels/gpu/manipulation.h"
-#include "paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"

 namespace pten {
@@ -44,47 +43,8 @@ void ReshapeWithXShape(const GPUContext& dev_ctx,
  Reshape(dev_ctx, x, shape, out);
 }

-template <typename T>
-void Cast(const GPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out) {
-  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
-                       detail::CastCUDAKernelImpl<T, data_t>(dev_ctx, x, out);
-                     }));
-}
-
 }  // namespace pten

-using float16 = paddle::platform::float16;
-
-#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \
-  PT_REGISTER_KERNEL(cast,                              \
-                     GPU,                               \
-                     ALL_LAYOUT,                        \
-                     pten::Cast,                        \
-                     float,                             \
-                     double,                            \
-                     int,                               \
-                     int64_t,                           \
-                     int16_t,                           \
-                     bool,                              \
-                     uint8_t,                           \
-                     paddle::platform::float16,         \
-                     paddle::platform::complex<float>,  \
-                     paddle::platform::complex<double>, \
-                     ##__VA_ARGS__) {                   \
-    kernel->OutputAt(0).SetDataType(                    \
-        paddle::experimental::DataType::UNDEFINED);     \
-  }
-
-#if !defined(PADDLE_WITH_HIP)
-PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
-#else
-PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
-#endif
-
 PT_REGISTER_NO_TEMPLATE_KERNEL(
    reshape, GPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
 PT_REGISTER_NO_TEMPLATE_KERNEL(

--- a/paddle/pten/kernels/gpu/manipulation.h
+++ b/paddle/pten/kernels/gpu/manipulation.h
@@ -24,13 +24,6 @@

 namespace pten {

-template <typename T>
-void Cast(const GPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out);
-
 void Reshape(const GPUContext& dev_ctx,
             const DenseTensor& x,
             const ScalarArray& shape,

--- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
+++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
@@ -41,8 +41,8 @@ namespace cub = hipcub;

 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/cast_kernel.h"
 #include "paddle/pten/kernels/copy_kernel.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"

 // Reduce split or not, Whether to use ReduceHigherDim
 #define REDUCE_SPLIT_BOUNDARY 512
@@ -1112,12 +1112,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
      AsyncCopy(x, y);
      y->Resize(out_dims);
    } else {
-      PD_VISIT_ALL_TYPES(
-          y->dtype(), "CastKernelImpl", ([&] {
-            pten::math::CastKernelImpl<paddle::platform::CUDADeviceContext,
-                                       Tx,
-                                       data_t>(*dev_ctx, x, y);
-          }));
+      pten::Cast<Tx>(*dev_ctx, x, y->dtype(), x.dtype(), y);
    }
    return;
  }

--- a/paddle/pten/kernels/hybird/general/reduce_impl.h
+++ b/paddle/pten/kernels/hybird/general/reduce_impl.h
@@ -16,8 +16,9 @@
 #include "paddle/fluid/platform/transform.h"
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/cast_kernel.h"
 #include "paddle/pten/kernels/hybird/eigen/reduce.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"
+
 namespace pten {
 namespace general {

@@ -57,11 +58,8 @@ void Reduce(const DeviceContext& dev_ctx,
        pten::make_intrusive<paddle::experimental::SharedStorage>(x.place()),
        pten::DenseTensorMeta(out_dtype, x.dims(), x.layout()));

-    // cast x tensor to out_dtype first
-    PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
-                         math::CastKernelImpl<DeviceContext, T, data_t>(
-                             dev_ctx, x, &tmp_tensor);
-                       }));
+    // cast x tensor to out_dtype
+    pten::Cast<T, DeviceContext>(dev_ctx, x, out_dtype, x.dtype(), &tmp_tensor);

    // do reduce sum
    PD_VISIT_ALL_TYPES(

--- a/paddle/pten/kernels/hybird/transpose.cu
+++ b/paddle/pten/kernels/hybird/transpose.cu
@@ -16,7 +16,6 @@
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"
 #include "paddle/pten/kernels/hybird/transpose.h"

 // See Note [ Why still include the fluid headers? ]