[PTen] Move inner empty and cast api to kernel.h (#38587)

* move inner cast api to cast_kernel.h * resolve conflict

[PTen] Move inner empty and cast api to kernel.h (#38587)
* move inner cast api to cast_kernel.h * resolve conflict
64538c8d · Chen Weihang · GitHub · 59888bba · 64538c8d · 64538c8d
11 changed files
--- a/paddle/fluid/operators/cast_op.h
+++ b/paddle/fluid/operators/cast_op.h
@@ -71,7 +71,7 @@ class CastOpKernel : public framework::OpKernel<InT> {
        static_cast<framework::proto::VarType::Type>(out_dtype));

    // call new kernel
-    pten::Cast<InT>(dev_ctx, *pt_x.get(), pt_out_dtype, pt_out.get());
+    pten::CastKernel<InT>(dev_ctx, *pt_x.get(), pt_out_dtype, pt_out.get());
  }
 };


--- a/paddle/pten/include/creation.h
+++ b/paddle/pten/include/creation.h
@@ -23,37 +23,6 @@ namespace pten {

 // TODO(YuanRisheng) This function name should be same as User API name.
 // TODO(zyfncg) Automatic code generation
-template <typename T, typename ContextT>
-DenseTensor Empty(const ContextT& dev_ctx,
-                  const ScalarArray& shape,
-                  DataType dtype = DataType::FLOAT32,
-                  Backend backend = Backend::CPU,  // Is backend needed here?
-                  DataLayout layout = DataLayout::NCHW) {
-  auto out_meta = CreateInferMeta(shape, dtype, layout);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Empty<T, ContextT>(dev_ctx, shape, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename ContextT>
-DenseTensor EmptyLike(
-    const ContextT& dev_ctx,
-    const DenseTensor& x,
-    DataType dtype = DataType::UNDEFINED,
-    Backend backend = Backend::UNDEFINED,  // Is backend needed here?
-    DataLayout layout = DataLayout::UNDEFINED) {
-  auto out_meta = CreateLikeInferMeta(x.meta(), dtype, layout);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  EmptyLike<T, ContextT>(dev_ctx, &dense_out);
-  return dense_out;
-}
-
 template <typename T, typename ContextT>
 DenseTensor Full(const ContextT& dev_ctx,
                 const ScalarArray& shape,

--- a/paddle/pten/include/manipulation.h
+++ b/paddle/pten/include/manipulation.h
@@ -37,19 +37,6 @@ DenseTensor Flatten(const ContextT& dev_ctx,
  return dense_out;
 }

-template <typename T, typename ContextT>
-DenseTensor Cast(const ContextT& dev_ctx,
-                 const DenseTensor& x,
-                 DataType out_dtype) {
-  auto out_meta = CastInferMeta(x.meta(), out_dtype);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Cast<T, ContextT>(dev_ctx, x, out_dtype, &dense_out);
-  return dense_out;
-}
-
 template <typename T, typename ContextT>
 DenseTensor Reshape(const ContextT& dev_ctx,
                    const DenseTensor& x,

--- a/paddle/pten/kernels/CMakeLists.txt
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -26,6 +26,8 @@ set_property(GLOBAL PROPERTY PTEN_KERNELS "")

 set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory convert_utils)
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas)
+# remove this dep after removing fluid deps on tensor creation
+set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils)
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)

 set(MATH_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel pten_transpose_cpu)

--- a/paddle/pten/kernels/cast_kernel.h
+++ b/paddle/pten/kernels/cast_kernel.h
@@ -15,13 +15,24 @@ limitations under the License. */
 #pragma once

 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/empty_kernel.h"

 namespace pten {

-template <typename T, typename ContextT>
-void Cast(const ContextT& dev_ctx,
+template <typename T, typename Context>
+void CastKernel(const Context& dev_ctx,
                const DenseTensor& x,
                DataType out_dtype,
                DenseTensor* out);

+template <typename T, typename Context>
+DenseTensor Cast(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 DataType out_dtype) {
+  auto out_meta = CastInferMeta(x.meta(), out_dtype);
+  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+  CastKernel<T, Context>(dev_ctx, x, out_dtype, &dense_out);
+  return dense_out;
+}
+
 }  // namespace pten
--- a/paddle/pten/kernels/cpu/cast_kernel.cc
+++ b/paddle/pten/kernels/cpu/cast_kernel.cc
@@ -46,8 +46,8 @@ void CastKernelImpl(const CPUContext& dev_ctx,
        CastOpTransformFunctor<InT, OutT>());
 }

-template <typename T, typename ContextT>
-void Cast(const ContextT& dev_ctx,
+template <typename T, typename Context>
+void CastKernel(const Context& dev_ctx,
                const DenseTensor& x,
                DataType out_dtype,
                DenseTensor* out) {
@@ -61,7 +61,7 @@ void Cast(const ContextT& dev_ctx,
 PT_REGISTER_CTX_KERNEL(cast,
                       CPU,
                       ALL_LAYOUT,
-                       pten::Cast,
+                       pten::CastKernel,
                       float,
                       double,
                       int,

--- a/paddle/pten/kernels/empty_kernel.cc
+++ b/paddle/pten/kernels/empty_kernel.cc
@@ -20,14 +20,14 @@ limitations under the License. */
 namespace pten {

 template <typename T, typename ContextT>
-void Empty(const ContextT& dev_ctx,
+void EmptyKernel(const ContextT& dev_ctx,
                 const ScalarArray& shape,
                 DenseTensor* out) {
  out->Resize(paddle::framework::make_ddim(shape.GetData()));
 }

 template <typename T, typename ContextT>
-void EmptyLike(const ContextT& dev_ctx, DenseTensor* out) {
+void EmptyLikeKernel(const ContextT& dev_ctx, DenseTensor* out) {
  out->mutable_data<T>();
 }

@@ -36,7 +36,7 @@ void EmptyLike(const ContextT& dev_ctx, DenseTensor* out) {
 PT_REGISTER_CTX_KERNEL(empty,
                       CPU,
                       ALL_LAYOUT,
-                       pten::Empty,
+                       pten::EmptyKernel,
                       bool,
                       int,
                       int64_t,
@@ -47,7 +47,7 @@ PT_REGISTER_CTX_KERNEL(empty,
 PT_REGISTER_CTX_KERNEL(empty_like,
                       CPU,
                       ALL_LAYOUT,
-                       pten::EmptyLike,
+                       pten::EmptyLikeKernel,
                       bool,
                       int,
                       int64_t,
@@ -59,7 +59,7 @@ PT_REGISTER_CTX_KERNEL(empty_like,
 PT_REGISTER_CTX_KERNEL(empty,
                       GPU,
                       ALL_LAYOUT,
-                       pten::Empty,
+                       pten::EmptyKernel,
                       bool,
                       int,
                       int64_t,
@@ -70,7 +70,7 @@ PT_REGISTER_CTX_KERNEL(empty,
 PT_REGISTER_CTX_KERNEL(empty_like,
                       GPU,
                       ALL_LAYOUT,
-                       pten::EmptyLike,
+                       pten::EmptyLikeKernel,
                       bool,
                       int,
                       int64_t,

--- a/paddle/pten/kernels/empty_kernel.h
+++ b/paddle/pten/kernels/empty_kernel.h
@@ -14,15 +14,56 @@

 #pragma once

+#include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/common/scalar_array.h"
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/infermeta/nary.h"
+#include "paddle/pten/infermeta/unary.h"

 namespace pten {

-template <typename T, typename ContextT>
-void Empty(const ContextT& dev_ctx, const ScalarArray& shape, DenseTensor* out);
+template <typename T, typename Context>
+void EmptyKernel(const Context& dev_ctx,
+                 const ScalarArray& shape,
+                 DenseTensor* out);

-template <typename T, typename ContextT>
-void EmptyLike(const ContextT& dev_ctx, DenseTensor* out);
+template <typename T, typename Context>
+void EmptyLikeKernel(const Context& dev_ctx, DenseTensor* out);
+
+// TODO(chenweihang): the tensor creation method needs to be replaced later;
+// all kernel APIs should call Empty here instead of creating tensors themselves
+template <typename T, typename Context>
+DenseTensor Empty(const Context& dev_ctx, DenseTensorMeta&& meta) {
+  pten::DenseTensor dense_out(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          dev_ctx.GetPlace()),
+      std::move(meta));
+  return dense_out;
+}
+
+template <typename T, typename Context>
+DenseTensor Empty(const Context& dev_ctx,
+                  const ScalarArray& shape,
+                  DataType dtype = DataType::FLOAT32,
+                  Backend backend = Backend::CPU,  // Is backend needed here?
+                  DataLayout layout = DataLayout::NCHW) {
+  auto out_meta = CreateInferMeta(shape, dtype, layout);
+  auto dense_out = Empty<T, Context>(dev_ctx, std::move(out_meta));
+  EmptyKernel<T, Context>(dev_ctx, shape, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename Context>
+DenseTensor EmptyLike(
+    const Context& dev_ctx,
+    const DenseTensor& x,
+    DataType dtype = DataType::UNDEFINED,
+    Backend backend = Backend::UNDEFINED,  // Is backend needed here?
+    DataLayout layout = DataLayout::UNDEFINED) {
+  auto out_meta = CreateLikeInferMeta(x.meta(), dtype, layout);
+  auto dense_out = Empty<T, Context>(dev_ctx, std::move(out_meta));
+  EmptyLikeKernel<T, Context>(dev_ctx, &dense_out);
+  return dense_out;
+}

 }  // namespace pten
--- a/paddle/pten/kernels/gpu/cast_kernel.cu
+++ b/paddle/pten/kernels/gpu/cast_kernel.cu
@@ -85,8 +85,8 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx,
  CastCUDAKernelImplWithPtr(dev_ctx, in_data, out_data, size);
 }

-template <typename T, typename ContextT>
-void Cast(const ContextT& dev_ctx,
+template <typename T, typename Context>
+void CastKernel(const Context& dev_ctx,
                const DenseTensor& x,
                DataType out_dtype,
                DenseTensor* out) {
@@ -101,7 +101,7 @@ void Cast(const ContextT& dev_ctx,
  PT_REGISTER_CTX_KERNEL(cast,                              \
                         GPU,                               \
                         ALL_LAYOUT,                        \
-                         pten::Cast,                        \
+                         pten::CastKernel,                  \
                         float,                             \
                         double,                            \
                         int,                               \

--- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
+++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
@@ -1112,7 +1112,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
      AsyncCopy(x, y);
      y->Resize(out_dims);
    } else {
-      pten::Cast<Tx>(*dev_ctx, x, y->dtype(), y);
+      pten::CastKernel<Tx>(*dev_ctx, x, y->dtype(), y);
    }
    return;
  }

--- a/paddle/pten/kernels/hybird/general/reduce_impl.h
+++ b/paddle/pten/kernels/hybird/general/reduce_impl.h
@@ -60,7 +60,7 @@ void Reduce(const DeviceContext& dev_ctx,
        pten::DenseTensorMeta(out_dtype, x.dims(), x.layout()));

    // cast x tensor to out_dtype
-    pten::Cast<T, DeviceContext>(dev_ctx, x, out_dtype, &tmp_tensor);
+    pten::CastKernel<T, DeviceContext>(dev_ctx, x, out_dtype, &tmp_tensor);

    // do reduce sum
    PD_VISIT_ALL_TYPES(