From 4d1ce1844b45f71be6f46f6227c67f89c8b10adf Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Wed, 22 Dec 2021 11:01:36 +0800
Subject: [PATCH] [PTen]Move flatten kernel to new directory (#38255)

* move flatten

* fix bugs of test

* modify header file

* add copy declare

* fix compile bugs
---
 paddle/fluid/operators/flatten_op.h           |   3 +-
 paddle/pten/CMakeLists.txt                    |   2 +-
 paddle/pten/api/lib/kernel_declare.h          |   3 +
 paddle/pten/include/manipulation.h            |   3 +-
 paddle/pten/kernels/CMakeLists.txt            |  15 +++
 paddle/pten/kernels/cpu/manipulation.cc       |  46 ------
 paddle/pten/kernels/cpu/manipulation.h        |   7 -
 paddle/pten/kernels/flatten_kernel.cc         | 126 ++++++++++++++++++
 paddle/pten/kernels/flatten_kernel.h          |  36 +++++
 paddle/pten/kernels/functions/common_shape.h  |  34 +++++
 paddle/pten/kernels/gpu/manipulation.cu       |  47 -------
 paddle/pten/kernels/gpu/manipulation.h        |   7 -
 paddle/pten/kernels/xpu/manipulation.cc       |  56 --------
 paddle/pten/kernels/xpu/manipulation.h        |   7 -
 .../tests/kernels/test_flatten_dev_api.cc     |  10 ++
 15 files changed, 229 insertions(+), 173 deletions(-)
 create mode 100644 paddle/pten/kernels/flatten_kernel.cc
 create mode 100644 paddle/pten/kernels/flatten_kernel.h
 create mode 100644 paddle/pten/kernels/functions/common_shape.h

diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h
index 7d08a958211..29eb579b2a0 100644
--- a/paddle/fluid/operators/flatten_op.h
+++ b/paddle/fluid/operators/flatten_op.h
@@ -134,7 +134,8 @@ class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
     auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
 
     // call new kernel
-    pten::Flatten<T>(dev_ctx, *pt_x.get(), start_axis, stop_axis, pt_out.get());
+    pten::Flatten<T, DeviceContext>(dev_ctx, *pt_x.get(), start_axis, stop_axis,
+                                    pt_out.get());
   }
 };
 
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 799ec885b99..7c870ec3364 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -24,7 +24,7 @@ add_subdirectory(tests)
 
 # make an unity target for compile deps
 set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context)
-set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu manipulation_cpu conj_kernel_cpu scale_kernel_cpu full_kernel_cpu)
+set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu manipulation_cpu conj_kernel_cpu scale_kernel_cpu full_kernel_cpu flatten)
 set(PTEN_DEPS ${PTEN_DEPS} nary unary binary)
 if(WITH_GPU OR WITH_ROCM)
   set(PTEN_DEPS ${PTEN_DEPS} math_gpu linalg_gpu manipulation_gpu conj_kernel_gpu scale_kernel_gpu full_kernel_gpu)
diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h
index 4dbd46bff65..01a3c193a34 100644
--- a/paddle/pten/api/lib/kernel_declare.h
+++ b/paddle/pten/api/lib/kernel_declare.h
@@ -22,6 +22,7 @@ limitations under the License. */
 
 PT_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(dot, CPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(flatten, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(sign, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
@@ -30,6 +31,7 @@ PT_DECLARE_KERNEL(conj, CPU, ALL_LAYOUT);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PT_DECLARE_KERNEL(full_like, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(dot, GPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(cast, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(flatten, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(sign, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(scale, GPU, ALL_LAYOUT);
@@ -38,4 +40,5 @@ PT_DECLARE_KERNEL(conj, GPU, ALL_LAYOUT);
 
 #ifdef PADDLE_WITH_XPU
 PT_DECLARE_KERNEL(flatten, XPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(reshape, XPU, ALL_LAYOUT);
 #endif
diff --git a/paddle/pten/include/manipulation.h b/paddle/pten/include/manipulation.h
index e94f2a61807..80bfcef89f7 100644
--- a/paddle/pten/include/manipulation.h
+++ b/paddle/pten/include/manipulation.h
@@ -18,6 +18,7 @@
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/include/infermeta.h"
 #include "paddle/pten/kernels/cpu/manipulation.h"
+#include "paddle/pten/kernels/flatten_kernel.h"
 #include "paddle/pten/kernels/gpu/manipulation.h"
 #include "paddle/pten/kernels/xpu/manipulation.h"
 
@@ -33,7 +34,7 @@ DenseTensor Flatten(const ContextT& dev_ctx,
       pten::make_intrusive<paddle::experimental::SharedStorage>(
           dev_ctx.GetPlace()),
       std::move(out_meta));
-  Flatten<T>(dev_ctx, x, start_axis, stop_axis, &dense_out);
+  Flatten<T, ContextT>(dev_ctx, x, start_axis, stop_axis, &dense_out);
   return dense_out;
 }
 
diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt
index 818ce6cb77a..bacdc1ce679 100644
--- a/paddle/pten/kernels/CMakeLists.txt
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -17,3 +17,18 @@ endif()
 if(WITH_XPU)
   add_subdirectory(xpu)
 endif()
+
+set(FLATTEN_DEPS dense_tensor kernel_context kernel_factory utils_cpu unary)
+if(WITH_GPU OR WITH_ROCM)
+  set(FLATTEN_DEPS ${FLATTEN_DEPS} utils_gpu)
+elseif(WITH_XPU)
+  set(FLATTEN_DEPS ${FLATTEN_DEPS} utils_xpu)
+endif()
+
+if(WITH_GPU)
+  nv_library(flatten SRCS flatten_kernel.cc DEPS ${FLATTEN_DEPS})
+elseif(WITH_ROCM)
+  hip_library(flatten SRCS flatten_kernel.cc DEPS ${FLATTEN_DEPS})
+else()
+  cc_library(flatten SRCS flatten_kernel.cc DEPS ${FLATTEN_DEPS})
+endif()
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
index 32bc8e4e35d..b413882c862 100644
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -21,31 +21,6 @@
 
 namespace pten {
 
-template <typename T>
-void Flatten(const CPUContext& dev_ctx,
-             const DenseTensor& x,
-             int start_axis,
-             int stop_axis,
-             DenseTensor* out) {
-  auto out_dims = out->dims();
-  pten::Copy(dev_ctx, x, false, out);
-  out->Resize(out_dims);
-}
-
-// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
-// Output Tensor,
-// is there a more flexible way to deal with this case?
-template <typename T>
-void FlattenWithXShape(const CPUContext& dev_ctx,
-                       const DenseTensor& x,
-                       int start_axis,
-                       int stop_axis,
-                       DenseTensor* out,
-                       DenseTensor* xshape) {
-  Flatten<T>(dev_ctx, x, start_axis, stop_axis, out);
-  general::SetXShape(x, xshape);
-}
-
 void Reshape(const CPUContext& dev_ctx,
              const DenseTensor& x,
              const ScalarArray& shape,
@@ -83,27 +58,6 @@ void Cast(const CPUContext& dev_ctx,
 
 }  // namespace pten
 
-PT_REGISTER_KERNEL(flatten,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Flatten,
-                   float,
-                   double,
-                   uint8_t,
-                   int8_t,
-                   int,
-                   int64_t) {}
-PT_REGISTER_KERNEL(flatten_with_xshape,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::FlattenWithXShape,
-                   float,
-                   double,
-                   uint8_t,
-                   int8_t,
-                   int,
-                   int64_t) {}
-
 PT_REGISTER_KERNEL(cast,
                    CPU,
                    ALL_LAYOUT,
diff --git a/paddle/pten/kernels/cpu/manipulation.h b/paddle/pten/kernels/cpu/manipulation.h
index 1a219dc79e6..5aa5344b192 100644
--- a/paddle/pten/kernels/cpu/manipulation.h
+++ b/paddle/pten/kernels/cpu/manipulation.h
@@ -21,13 +21,6 @@
 limitations under the License. */
 
 namespace pten {
 
-template <typename T>
-void Flatten(const CPUContext& dev_ctx,
-             const DenseTensor& x,
-             int start_axis,
-             int stop_axis,
-             DenseTensor* out);
-
 template <typename T>
 void Cast(const CPUContext& dev_ctx,
diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc
new file mode 100644
index 00000000000..b2b5d74432a
--- /dev/null
+++ b/paddle/pten/kernels/flatten_kernel.cc
@@ -0,0 +1,126 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/flatten_kernel.h"
+#include "paddle/pten/api/ext/dispatch.h"
+#include "paddle/pten/backends/all_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/infermeta/unary.h"
+#include "paddle/pten/kernels/cpu/utils.h"
+#include "paddle/pten/kernels/functions/common_shape.h"
+#include "paddle/pten/kernels/gpu/utils.h"
+#include "paddle/pten/kernels/xpu/utils.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+void Flatten(const ContextT& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out) {
+  auto out_dims = out->dims();
+  pten::Copy(dev_ctx, x, false, out);
+  out->Resize(out_dims);
+}
+
+// TODO(yuanrisheng): this kernel is for training and xshape is an Intermediate
+// Output Tensor,
+// is there a more flexible way to deal with this case?
+template <typename T, typename ContextT>
+void FlattenWithXShape(const ContextT& dev_ctx,
+                       const DenseTensor& x,
+                       int start_axis,
+                       int stop_axis,
+                       DenseTensor* out,
+                       DenseTensor* xshape) {
+  Flatten<T, ContextT>(dev_ctx, x, start_axis, stop_axis, out);
+  functions::SetXShape(x, xshape);
+}
+
+}  // namespace pten
+
+PT_REGISTER_CTX_KERNEL(flatten,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::Flatten,
+                       float,
+                       double,
+                       uint8_t,
+                       int8_t,
+                       int,
+                       int64_t) {}
+
+PT_REGISTER_CTX_KERNEL(flatten_with_xshape,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::FlattenWithXShape,
+                       float,
+                       double,
+                       uint8_t,
+                       int8_t,
+                       int,
+                       int64_t) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_REGISTER_CTX_KERNEL(flatten,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::Flatten,
+                       float,
+                       paddle::platform::float16,
+                       double,
+                       uint8_t,
+                       int8_t,
+                       int,
+                       int64_t) {}
+
+PT_REGISTER_CTX_KERNEL(flatten_with_xshape,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::FlattenWithXShape,
+                       float,
+                       paddle::platform::float16,
+                       double,
+                       uint8_t,
+                       int8_t,
+                       int,
+                       int64_t) {}
+#endif
+
+#ifdef PADDLE_WITH_XPU
+PT_REGISTER_CTX_KERNEL(flatten,
+                       XPU,
+                       ALL_LAYOUT,
+                       pten::Flatten,
+                       float,
+                       paddle::platform::float16,
+                       double,
+                       uint8_t,
+                       int8_t,
+                       int,
+                       int64_t) {}
+
+PT_REGISTER_CTX_KERNEL(flatten_with_xshape,
+                       XPU,
+                       ALL_LAYOUT,
+                       pten::FlattenWithXShape,
+                       float,
+                       paddle::platform::float16,
+                       double,
+                       uint8_t,
+                       int8_t,
+                       int,
+                       int64_t) {}
+#endif
diff --git a/paddle/pten/kernels/flatten_kernel.h b/paddle/pten/kernels/flatten_kernel.h
new file mode 100644
index 00000000000..6ce0a2be20a
--- /dev/null
+++ b/paddle/pten/kernels/flatten_kernel.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+void Flatten(const ContextT& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out);
+
+template <typename T, typename ContextT>
+void FlattenWithXShape(const ContextT& dev_ctx,
+                       const DenseTensor& x,
+                       int start_axis,
+                       int stop_axis,
+                       DenseTensor* out,
+                       DenseTensor* xshape);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/functions/common_shape.h b/paddle/pten/kernels/functions/common_shape.h
new file mode 100644
index 00000000000..3fa129014eb
--- /dev/null
+++ b/paddle/pten/kernels/functions/common_shape.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+namespace functions {
+
+inline void SetXShape(const DenseTensor& x, DenseTensor* xshape) {
+  const auto& in_dims = x.meta().dims;
+  std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < in_dims.size(); ++i) {
+    xshape_dims[i + 1] = in_dims[i];
+  }
+  xshape->Resize(paddle::framework::make_ddim(xshape_dims));
+  xshape->ResetLoD(x.meta().lod);
+}
+
+}  // namespace functions
+}  // namespace pten
diff --git a/paddle/pten/kernels/gpu/manipulation.cu b/paddle/pten/kernels/gpu/manipulation.cu
index 5a82e3e030b..8c4aa7449a3 100644
--- a/paddle/pten/kernels/gpu/manipulation.cu
+++ b/paddle/pten/kernels/gpu/manipulation.cu
@@ -21,31 +21,6 @@
 
 namespace pten {
 
-template <typename T>
-void Flatten(const GPUContext& dev_ctx,
-             const DenseTensor& x,
-             int start_axis,
-             int stop_axis,
-             DenseTensor* out) {
-  auto out_dims = out->dims();
-  pten::Copy(dev_ctx, x, false, out);
-  out->Resize(out_dims);
-}
-
-// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
-// Output Tensor,
-// is there a more flexible way to deal with this case?
-template <typename T>
-void FlattenWithXShape(const GPUContext& dev_ctx,
-                       const DenseTensor& x,
-                       int start_axis,
-                       int stop_axis,
-                       DenseTensor* out,
-                       DenseTensor* xshape) {
-  Flatten<T>(dev_ctx, x, start_axis, stop_axis, out);
-  general::SetXShape(x, xshape);
-}
-
 void Reshape(const GPUContext& dev_ctx,
              const DenseTensor& x,
              const ScalarArray& shape,
@@ -84,28 +59,6 @@ void Cast(const GPUContext& dev_ctx,
 
 using float16 = paddle::platform::float16;
 
-PT_REGISTER_KERNEL(flatten,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Flatten,
-                   float,
-                   float16,
-                   double,
-                   uint8_t,
-                   int8_t,
-                   int,
-                   int64_t) {}
-PT_REGISTER_KERNEL(flatten_with_xshape,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::FlattenWithXShape,
-                   float,
-                   double,
-                   uint8_t,
-                   int8_t,
-                   int,
-                   int64_t) {}
-
 #define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \
   PT_REGISTER_KERNEL(cast,                              \
                      GPU,                               \
diff --git a/paddle/pten/kernels/gpu/manipulation.h b/paddle/pten/kernels/gpu/manipulation.h
index b47fadd70bd..af49932c2e5 100644
--- a/paddle/pten/kernels/gpu/manipulation.h
+++ b/paddle/pten/kernels/gpu/manipulation.h
@@ -24,13 +24,6 @@
 
 namespace pten {
 
-template <typename T>
-void Flatten(const GPUContext& dev_ctx,
-             const DenseTensor& x,
-             int start_axis,
-             int stop_axis,
-             DenseTensor* out);
-
 template <typename T>
 void Cast(const GPUContext& dev_ctx,
diff --git a/paddle/pten/kernels/xpu/manipulation.cc b/paddle/pten/kernels/xpu/manipulation.cc
index 70ac70371e9..ecd673015a6 100644
--- a/paddle/pten/kernels/xpu/manipulation.cc
+++ b/paddle/pten/kernels/xpu/manipulation.cc
@@ -19,38 +19,6 @@
 
 namespace pten {
 
-template <typename T>
-void Flatten(const XPUContext& dev_ctx,
-             const DenseTensor& x,
-             int start_axis,
-             int stop_axis,
-             DenseTensor* out) {
-  auto out_dims = out->dims();
-  pten::Copy(dev_ctx, x, false, out);
-  out->Resize(out_dims);
-}
-
-// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
-// Output Tensor,
-// is there a more flexible way to deal with this case?
-template <typename T>
-void FlattenWithXShape(const XPUContext& dev_ctx,
-                       const DenseTensor& x,
-                       int start_axis,
-                       int stop_axis,
-                       DenseTensor* out,
-                       DenseTensor* xshape) {
-  Flatten<T>(dev_ctx, x, start_axis, stop_axis, out);
-  const auto& in_dims = x.dims();
-  std::vector<int64_t> xshape_dims(in_dims.size() + 1);
-  xshape_dims[0] = 0;
-  for (int i = 0; i < in_dims.size(); ++i) {
-    xshape_dims[i + 1] = in_dims[i];
-  }
-  xshape->Resize(paddle::framework::make_ddim(xshape_dims));
-  xshape->ResetLoD(x.lod());
-}
-
 void Reshape(const XPUContext& dev_ctx,
              const DenseTensor& x,
              const ScalarArray& shape,
@@ -76,29 +44,5 @@ void ReshapeWithXShape(const XPUContext& dev_ctx,
 
 }  // namespace pten
 
-PT_REGISTER_KERNEL(flatten,
-                   XPU,
-                   ALL_LAYOUT,
-                   pten::Flatten,
-                   float,
-                   paddle::platform::float16,
-                   double,
-                   uint8_t,
-                   int8_t,
-                   int,
-                   int64_t) {}
-
-PT_REGISTER_KERNEL(flatten_with_xshape,
-                   XPU,
-                   ALL_LAYOUT,
-                   pten::FlattenWithXShape,
-                   float,
-                   paddle::platform::float16,
-                   double,
-                   uint8_t,
-                   int8_t,
-                   int,
-                   int64_t) {}
-
 PT_REGISTER_NO_TEMPLATE_KERNEL(
     reshape, XPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
diff --git a/paddle/pten/kernels/xpu/manipulation.h b/paddle/pten/kernels/xpu/manipulation.h
index 0b68ae41951..b1557a279fa 100644
--- a/paddle/pten/kernels/xpu/manipulation.h
+++ b/paddle/pten/kernels/xpu/manipulation.h
@@ -23,13 +23,6 @@ limitations under the License. */
 
 namespace pten {
 
-template <typename T>
-void Flatten(const XPUContext& dev_ctx,
-             const DenseTensor& x,
-             int start_axis,
-             int stop_axis,
-             DenseTensor* out);
-
 void Reshape(const XPUContext& dev_ctx,
              const DenseTensor& x,
              const ScalarArray& shape,
diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
index a351be3cf66..d2ff7480e90 100644
--- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
@@ -21,6 +21,16 @@ limitations under the License. */
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
 
+PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
+#endif
+
+#ifdef PADDLE_WITH_XPU
+PT_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT);
+#endif
+
 namespace pten {
 namespace tests {
 
-- 
GitLab
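
For reference, a minimal sketch (not part of the patch) of how the relocated
flatten kernel is reached through the dev API declared in
paddle/pten/include/manipulation.h. It is modeled on the existing
test_flatten_dev_api.cc touched above; the allocator and device-context setup
below mirror that test's conventions and are illustrative assumptions, not
code from this commit. Collapsing axes [1, 2] of a [3, 2, 2, 3] input merges
the two middle dimensions (2 * 2 = 4), giving an output of shape [3, 4, 3].

    // Sketch: exercising pten::Flatten on CPU via the dev API after the move.
    #include <memory>

    #include "paddle/fluid/framework/ddim.h"
    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/pten/api/lib/utils/allocator.h"
    #include "paddle/pten/include/manipulation.h"

    void FlattenDevApiSketch() {
      // Build a float32 input of shape [3, 2, 2, 3] on the CPU.
      const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
          paddle::platform::CPUPlace());
      pten::DenseTensor dense_x(
          alloc,
          pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                paddle::framework::make_ddim({3, 2, 2, 3}),
                                pten::DataLayout::NCHW));
      auto* dense_x_data = dense_x.mutable_data<float>();
      for (int64_t i = 0; i < dense_x.numel(); ++i) {
        dense_x_data[i] = 0.f;
      }

      // Fetch the CPU device context and collapse axes [1, 2]; the result
      // should have dims [3, 4, 3]. ContextT is deduced from the argument.
      auto& pool = paddle::platform::DeviceContextPool::Instance();
      auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
      auto out = pten::Flatten<float>(
          *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
          dense_x,
          /*start_axis=*/1,
          /*stop_axis=*/2);
    }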