From 09d407b047ce296cc4e7f72c7b8512389f81d594 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Fri, 5 Nov 2021 11:05:53 +0800
Subject: [PATCH] [PTen]Support XPU for Flatten Kernel (#36957)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* initial tensor design & sign kernel demo

* add move constructor for meta & add lodtensor

* add dirs & sign xpu kernel

* add mean cpu&cuda kernel impl

* move sign & mean xpu & npu kernel

* add selected_rows basic impl

* refactor design, BaseTensor to DenseTensor, etc.

* add scale mkldnn kernel

* polish xpu & npu impl details

* fix mkldnn reuse compile failed

* change tensor operation lib name

* rename util filename

* add more comments

* change TensorImplInterface to TensorInterface

* add kernel key and factory

* remove MKLDNNTensorMeta, add MKLDNNDenseTensor

* change XXDeviceContext to XXContext

* add base kernel registrar utils & test on sign

* replace boost::any by paddle::any

* fix several ci failed

* fix npu compile error

* add ordered map util

* fix multiple ordered_map compile errors

* move dev into include dir

* support sign op in static op run

* fix static op run error

* fix new executor compile failed

* add dygraph branch & remove sign_op.h

* fix test_infer_no_need_buffer_slots

* fix rocm compile link error

* fix unitybuild error & clear glog

* fix npu compile failed

* skip quant trans test

* fix part windows compile problem

* fix xpu enforce error

* fix inference test failed

* remove ordered_map to solve quant failed

* fix part of rocm compile failed

* add more register kernels

* revert scale kernel temporarily

* fix code format error

* add new kernel registrar macro

* rename top to tcmpt

* revert xpu, npu, mkldnn impl & remove op def

* add kernel args parse functor to auto parse args

* revert some change & add scale kernels

* add op proto in dygraph kernelcontext building

* polish kernel dispatch logic & naming rule

* fix scale kernel match error

* fix scale test failed

* add mean API and unittest

* test mean api success

* add branch to solve compiled error

* skip clang format error

* add mean skip rule in op_library

* add dot kernel, api and unittest (#6)

* remove old kernel and add symbol link

* fix dot compiled failed

* add macro for module declare

* fix npu and xpu compile error

* revert sign, mean, scale, dot kernel removing

* add comment for keeping old kernel impl

* fix mutable_data error

* fix bfloat16 conflict

* fix inference undef error

* adapt to msvc compile rules

* polish comment for template inst

* add cmake template instantiation for win

* fix backend to place device id bug

* fix ifdef error

* Op2functor (#7)

* add kernel args maker class

* make args maker non-const

* remove debug log

* modify codes by review options

* split constructPrKernelContext function

* fix output name bug

* fix test_mean_op test_sign_op failed

* fill_any_like kernel refactor (#10)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* skip dtype for fill_any_like

* add attrs for kernel key construct

* add use_pt_kernel Flags to control whether to use pt kernel (#13)

* add use_pt_kernel Flags to control whether to use pt kernel

* change the default value to true for checking pt kernels

* fix mutable_data cuda place error

* move high level apis into hapi

* remove selectedrows adapting temporarily

* Support Scalar in Tensor Compute Library (#14)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* Support Scalar in Tensor Compute Library

* add scalar in dygraph and static graph mode

* keep the basic type for attr, instead of using scalar for all

* merge the code

* remove mkldnn tensor & polish details

* use flat_hash_map and small_vector in kernel factory

* Refactor flatten kernel (#12)

* refactor flatten kernel

* update infershape function

* fix compile bugs

* fix bugs when merge

* fix compiler bugs

* fix bugs when run test_flatten_api

* fix bugs when run test

* Revert "use flat_hash_map and small_vector in kernel factory"

This reverts commit 23091495cfdd3df8cc1be592d30f09ea66a7c72b.

* Move cpu, cuda and other device code into kernels (#15)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* Support Scalar in Tensor Compute Library

* add scalar in dygraph and static graph mode

* keep the basic type for attr, instead of using scalar for all

* merge the code

* start refactor matmul

* move cpu, cuda and other device modules into kernels

* merge code

* polish code in operator.cc

* Perfect unittests (#16)

* perfect unittest

* update license

* replace with flat_hash_map, small_vector (#19)

* fix small_vector build error on windows platform

* replace with flat_hash_map, small_vector

* remove todo

* Perfect unittests (#20)

* perfect unittest

* update license

* fix bug when run tcmpt_utils_test

* refactor execution adapting impl

* fix insert conflict

* Fix CI bug of test_yolov3 (#21)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* Support Scalar in Tensor Compute Library

* add scalar in dygraph and static graph mode

* keep the basic type for attr, instead of using scalar for all

* merge the code

* start refactor matmul

* move cpu, cuda and other device modules into kernels

* merge code

* polish code in operator.cc

* Fix CI bug of test_yolov3

* add the tensor base class, test=develop (#17)

* update the tensor base class, test=develop

* remove two funcs, test=develop

* update the error msg, test=develop

Co-authored-by: Chen Weihang <chenweihang@baidu.com>

* [no-verify] commit backend and tensor signature changes

* Rename tcmpt to pten (#23)

* rename tcmpt to pten

* update omitted files for rename to pten

* update omitted file for rename to pten

* remove k of all enum var

* remove kernel_instantiate (#26)

* remove symbols and spatial_tensor

* change common to functions

* readd share tensor impl methods

* add a candidate dense tensor class, test=develop (#28)

* change all Pt to Pten

* resolve conflict with xiaowei

* Op2functor opt1 (#27)

* replace to small vector and change to const &

* add std::move

Co-authored-by: Chen Weihang <chenweihang@baidu.com>

* polish kernel factory and kernel registry

* fix operator test error msg mismatch

* remove tensor signature and backend set member

* move scalar and polish enforce

* revert dtype layout change to fix error

* fix enum operator override error

* Add Intermediate API layer

* add several base unittests

* add pten utils tests

* polish some details

* Dev/op2func refactor 3 (#30)

* add a candidate dense tensor class, test=develop

* remove TensorBase::backend(), test=develop

* remove some ops, test=develop

* cherry-pick the pr of tensor meta, test=develop

* moves the dense tensor and some ops, test=develop

* update the linalg operator, test=develop

* update other operators, test=develop

* fix errors, test=develop

* fix bugs, test=develop

* try to resolve the problem of windows ci, test=develop

* updates codes, test=develop

* fix the tensor_utils.cc, test=develop

* modify the dense tensor, test=develop

* fix the data type, test=develop

Co-authored-by: shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>

* intermediate api adapt to new dense tensor

* add some TODO and delete include header

* Support XPU for Flatten Kernel

* fix bugs when run kunlun ci

* fix compile bugs

* fix bugs for kunlun ci

* fix compile bugs when run kunlun

* fix compile bugs in kunlun

* fix compile bugs in kunlun

* fix bugs when compile

* fix bugs when compile

* fix compile bug

* delete useless annotation

Co-authored-by: Chen Weihang <chenweihang@baidu.com>
Co-authored-by: chentianyu03 <ctychentianyu@gmail.com>
Co-authored-by: zyfncg <1370305206@qq.com>
Co-authored-by: 石晓伟 <39303645+Shixiaowei02@users.noreply.github.com>
---
 paddle/fluid/operators/flatten_op.cc    | 44 ++++---------
 paddle/fluid/operators/flatten_op.h     | 18 ++++--
 paddle/pten/CMakeLists.txt              |  4 +-
 paddle/pten/include/manipulation.h      |  1 +
 paddle/pten/kernels/xpu/CMakeLists.txt  |  2 +
 paddle/pten/kernels/xpu/manipulation.cc | 82 +++++++++++++++++++++++++
 paddle/pten/kernels/xpu/manipulation.h  | 38 ++++++++++++
 paddle/pten/kernels/xpu/utils.cc        | 81 ++++++++++++++++++++++++
 paddle/pten/kernels/xpu/utils.h         | 34 ++++++++++
 9 files changed, 265 insertions(+), 39 deletions(-)
 create mode 100644 paddle/pten/kernels/xpu/manipulation.cc
 create mode 100644 paddle/pten/kernels/xpu/manipulation.h
 create mode 100644 paddle/pten/kernels/xpu/utils.cc
 create mode 100644 paddle/pten/kernels/xpu/utils.h

diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
index 14f2e9061b..517422af1f 100644
--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -79,14 +79,6 @@ class FlattenOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
         framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
-
-    //#ifdef PADDLE_WITH_MKLDNN
-    //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-    //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-    //                                     framework::DataLayout::kMKLDNN,
-    //                                     framework::LibraryType::kMKLDNN);
-    //    }
-    //#endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -157,14 +149,6 @@ class FlattenGradOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
         ctx, framework::GradVarName("Out"));
-
-    //#ifdef PADDLE_WITH_MKLDNN
-    //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-    //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-    //                                     framework::DataLayout::kMKLDNN,
-    //                                     framework::LibraryType::kMKLDNN);
-    //    }
-    //#endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -227,14 +211,6 @@ class Flatten2Op : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
         framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
-
-    //#ifdef PADDLE_WITH_MKLDNN
-    //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-    //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-    //                                     framework::DataLayout::kMKLDNN,
-    //                                     framework::LibraryType::kMKLDNN);
-    //    }
-    //#endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -285,14 +261,6 @@ class Flatten2GradOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
         ctx, framework::GradVarName("Out"));
-
-    //#ifdef PADDLE_WITH_MKLDNN
-    //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-    //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-    //                                     framework::DataLayout::kMKLDNN,
-    //                                     framework::LibraryType::kMKLDNN);
-    //    }
-    //#endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -365,6 +333,18 @@ class FlattenContiguousRangeOp : public framework::OperatorWithKernel {
 
     return out_shape;
   }
+
+  framework::KernelSignature GetExpectedPtenKernelArgs(
+      const framework::ExecutionContext &ctx) const override {
+    if (ctx.HasOutput("XShape")) {
+      return framework::KernelSignature("flatten_contiguous_range.mid", {"X"},
+                                        {"start_axis", "stop_axis"},
+                                        {"Out", "XShape"});
+    } else {
+      return framework::KernelSignature("flatten_contiguous_range", {"X"},
+                                        {"start_axis", "stop_axis"}, {"Out"});
+    }
+  }
 };
 
 class FlattenContiguousRangeOpMaker : public FlattenOpMaker {
diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h
index efcb0cbe2e..7d08a95821 100644
--- a/paddle/fluid/operators/flatten_op.h
+++ b/paddle/fluid/operators/flatten_op.h
@@ -15,10 +15,13 @@ limitations under the License. */
 #pragma once
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/manipulation.h"
 
 namespace paddle {
 namespace operators {
@@ -122,13 +125,16 @@ class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &context) const override {
     auto *in = context.Input<framework::LoDTensor>("X");
     auto *out = context.Output<framework::LoDTensor>("Out");
-    auto out_dims = out->dims();
-
     out->mutable_data(context.GetPlace(), in->type());
-    framework::TensorCopy(
-        *in, context.GetPlace(),
-        context.template device_context<platform::DeviceContext>(), out);
-    out->Resize(out_dims);
+    auto &start_axis = context.Attr<int>("start_axis");
+    auto &stop_axis = context.Attr<int>("stop_axis");
+    auto &dev_ctx = context.device_context<DeviceContext>();
+
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*in);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
+
+    // call new kernel
+    pten::Flatten<T>(dev_ctx, *pt_x.get(), start_axis, stop_axis, pt_out.get());
   }
 };
 
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 0444fa593c..e72ec1f8ae 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -17,5 +17,7 @@ set(PTEN_DEPS ${PTEN_DEPS} unary binary)
 if(WITH_GPU OR WITH_ROCM)
   set(PTEN_DEPS ${PTEN_DEPS} math_cuda linalg_cuda creation_cuda manipulation_cuda)
 endif()
-
+if(WITH_XPU)
+  set(PTEN_DEPS ${PTEN_DEPS} manipulation_xpu)
+endif()
 cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS})
diff --git a/paddle/pten/include/manipulation.h b/paddle/pten/include/manipulation.h
index 236f7c7af9..e10f296dbd 100644
--- a/paddle/pten/include/manipulation.h
+++ b/paddle/pten/include/manipulation.h
@@ -19,6 +19,7 @@
 #include "paddle/pten/include/infershape.h"
 #include "paddle/pten/kernels/cpu/manipulation.h"
 #include "paddle/pten/kernels/cuda/manipulation.h"
+#include "paddle/pten/kernels/xpu/manipulation.h"
 
 namespace pten {
 
diff --git a/paddle/pten/kernels/xpu/CMakeLists.txt b/paddle/pten/kernels/xpu/CMakeLists.txt
index e69de29bb2..3ba070bdd6 100644
--- a/paddle/pten/kernels/xpu/CMakeLists.txt
+++ b/paddle/pten/kernels/xpu/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(utils_xpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
+cc_library(manipulation_xpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_xpu unary)
diff --git a/paddle/pten/kernels/xpu/manipulation.cc b/paddle/pten/kernels/xpu/manipulation.cc
new file mode 100644
index 0000000000..379e459a60
--- /dev/null
+++ b/paddle/pten/kernels/xpu/manipulation.cc
@@ -0,0 +1,82 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/xpu/manipulation.h"
+#include "paddle/pten/infershape/unary.h"
+#include "paddle/pten/kernels/xpu/utils.h"
+
+namespace pten {
+
+template <typename T>
+void Flatten(const XPUContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out) {
+  auto out_dims = out->dims();
+  pten::Copy(dev_ctx, x, out);
+  out->Resize(out_dims);
+}
+
+// TODO(yuanrisheng): this kernel is for training and xshape is an
+// intermediate output tensor. Is there a more flexible way to deal with
+// this case?
+template <typename T>
+void FlattenWithXShape(const XPUContext& dev_ctx,
+                       const DenseTensor& x,
+                       int start_axis,
+                       int stop_axis,
+                       DenseTensor* out,
+                       DenseTensor* xshape) {
+  Flatten<T>(dev_ctx, x, start_axis, stop_axis, out);
+  const auto& in_dims = x.dims();
+  std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < in_dims.size(); ++i) {
+    xshape_dims[i + 1] = in_dims[i];
+  }
+  xshape->Resize(paddle::framework::make_ddim(xshape_dims));
+  xshape->set_lod(x.lod());
+}
+
+}  // namespace pten
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(ManipulationXPU);
+
+// TODO(yuanrisheng): "flatten_contiguous_range" is kept for compatibility with
+// the old kernel architecture; the kernel name should become "flatten".
+PT_REGISTER_KERNEL("flatten_contiguous_range",
+                   XPU,
+                   ANY,
+                   pten::Flatten,
+                   float,
+                   paddle::platform::float16,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
+
+PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
+                   XPU,
+                   ANY,
+                   pten::FlattenWithXShape,
+                   float,
+                   paddle::platform::float16,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
diff --git a/paddle/pten/kernels/xpu/manipulation.h b/paddle/pten/kernels/xpu/manipulation.h
new file mode 100644
index 0000000000..02947759b4
--- /dev/null
+++ b/paddle/pten/kernels/xpu/manipulation.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pten {
+
+using XPUContext = paddle::platform::XPUDeviceContext;
+
+template <typename T>
+void Flatten(const XPUContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out);
+
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/kernels/xpu/utils.cc b/paddle/pten/kernels/xpu/utils.cc
new file mode 100644
index 0000000000..33bdc66ff0
--- /dev/null
+++ b/paddle/pten/kernels/xpu/utils.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/kernels/xpu/utils.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/core/convert_utils.h"
+
+namespace pten {
+
+void Copy(const XPUDeviceContext& dev_ctx,
+          const DenseTensor& src,
+          DenseTensor* dst) {
+  auto* src_ptr = src.data();
+  auto* dst_ptr = dst->mutable_data();
+  const auto& src_place = src.place();
+  const auto& dst_place = dst->place();
+
+  if (src_ptr == dst_ptr && src_place == dst_place) {
+    VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+            << dst_place;
+    return;
+  }
+  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
+
+  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
+          << dst_place;
+  dst->Resize(src.dims());
+  CHECK(dst->layout() == src.layout());
+  auto size = src.numel() * paddle::framework::SizeOfType(
+                                TransToProtoVarType(src.data_type()));
+
+  if (paddle::platform::is_xpu_place(src_place) &&  // NOLINT
+      paddle::platform::is_cpu_place(dst_place)) {
+    paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place),
+                         dst_ptr,
+                         BOOST_GET_CONST(paddle::platform::XPUPlace, src_place),
+                         src_ptr,
+                         size);
+  } else if (paddle::platform::is_cpu_place(src_place) &&
+             paddle::platform::is_xpu_place(dst_place)) {
+    paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::XPUPlace, dst_place),
+                         dst_ptr,
+                         BOOST_GET_CONST(paddle::platform::CPUPlace, src_place),
+                         src_ptr,
+                         size);
+  } else if (paddle::platform::is_xpu_place(src_place) &&
+             paddle::platform::is_xpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
+    paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::XPUPlace, dst_place),
+                         dst_ptr,
+                         BOOST_GET_CONST(paddle::platform::XPUPlace, src_place),
+                         src_ptr,
+                         size);
+  } else {
+    PADDLE_THROW(paddle::platform::errors::Unimplemented(
+        "Copy from %s to %s is not supported.", src_place, dst_place));
+  }
+}
+
+}  // namespace pten
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(UtilsXPU);
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", XPU, ANY, pten::Copy) {}
diff --git a/paddle/pten/kernels/xpu/utils.h b/paddle/pten/kernels/xpu/utils.h
new file mode 100644
index 0000000000..c92812ed68
--- /dev/null
+++ b/paddle/pten/kernels/xpu/utils.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+namespace pten {
+
+using XPUDeviceContext = paddle::platform::XPUDeviceContext;
+
+void Copy(const XPUDeviceContext& dev_ctx,
+          const DenseTensor& src,
+          DenseTensor* dst);
+
+}  // namespace pten
+
+#endif
-- 
GitLab