diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 3cc43cdfe643c59beab110dbd9797b8c1b1e4a71..1df77c78a419bb5c99a06a327b3309fdf3c7e6f2 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -275,7 +275,7 @@ if(WITH_PYTHON)
   if(NOT ON_INFER)
     cc_library(paddle_eager
     SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc
-    DEPS eager_api autograd_meta backward grad_node_info pten op_function_common dygraph_function dygraph_node math_cpu linalg_cpu utils_cpu manipulation_cpu accumulation_node global_utils utils python)
+    DEPS eager_api autograd_meta backward grad_node_info pten op_function_common dygraph_function dygraph_node accumulation_node global_utils utils python)
     add_dependencies(paddle_eager eager_codegen)
     add_dependencies(paddle_eager eager_op_function_generator_cmd)
     list(APPEND PYBIND_DEPS paddle_eager)
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 9605a3b0091a25e30b94909ae71da9a39ef579e3..5e961ce23dbaa144936a338af3beac41b8be0a20 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -25,6 +25,8 @@ add_subdirectory(tests)
 # make an unity target for compile deps
 set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context)
 get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
+# Keep this message for debugging; remove it later if it is no longer needed
+message(STATUS "All standard pten kernels: ${pten_kernels}")
 set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels})
 set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu manipulation_cpu)
 set(PTEN_DEPS ${PTEN_DEPS} nary unary binary)
diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt
index 9d37effb8d7b312f4d4d7f7cd3f21029ac287d2a..27d274a5b34ad51663414b9571a465d72a5c8df7 100644
--- a/paddle/pten/kernels/CMakeLists.txt
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -33,15 +33,6 @@ set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function)
 # auto build kernel targets by cmake
 register_kernels(EXCLUDES flatten_kernel DEPS ${COMMON_KERNEL_DEPS})
 # TODO(chenweihang): auto parse compile deps by include headers later
-set(FLATTEN_DEPS ${COMMON_KERNEL_DEPS} utils_cpu unary)
-if(WITH_GPU OR WITH_ROCM)
-  set(FLATTEN_DEPS ${FLATTEN_DEPS} utils_gpu)
-elseif(WITH_XPU)
-  set(FLATTEN_DEPS ${FLATTEN_DEPS} utils_xpu)
-endif()
-kernel_library(flatten_kernel DEPS ${FLATTEN_DEPS})
-
-get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
-message(STATUS "PTEN_KERNELS: ${pten_kernels}")
+kernel_library(flatten_kernel DEPS ${COMMON_KERNEL_DEPS} copy_kernel unary)
 
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
diff --git a/paddle/pten/kernels/cpu/utils.h b/paddle/pten/kernels/copy_kernel.h
similarity index 85%
rename from paddle/pten/kernels/cpu/utils.h
rename to paddle/pten/kernels/copy_kernel.h
index 93730692079e3e6ab6c6fac9578c9b108d66c63f..d095d18a371f0913cafa793fae7392326b39934e 100644
--- a/paddle/pten/kernels/cpu/utils.h
+++ b/paddle/pten/kernels/copy_kernel.h
@@ -14,13 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/kernel_registry.h"
 
 namespace pten {
 
-void Copy(const CPUContext& dev_ctx,
+template <typename ContextT>
+void Copy(const ContextT& dev_ctx,
           const DenseTensor& src,
           bool blocking,
           DenseTensor* dst);
diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt
index f45d511602d71a175c5f917cd955e5be93a8f431..f7dabf47eb68f929cb733d572dac2931a44fa366 100644
--- a/paddle/pten/kernels/cpu/CMakeLists.txt
+++ b/paddle/pten/kernels/cpu/CMakeLists.txt
@@ -1,4 +1,3 @@
 cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
-cc_library(utils_cpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
-cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_cpu unary)
+cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
diff --git a/paddle/pten/kernels/cpu/utils.cc b/paddle/pten/kernels/cpu/copy_kernel.cc
similarity index 84%
rename from paddle/pten/kernels/cpu/utils.cc
rename to paddle/pten/kernels/cpu/copy_kernel.cc
index 1ca20df4d92dcbc89008c5befa0c6bfb37c36de5..6a81579eb4f0333c78df169ea82a52cec1a44eed 100644
--- a/paddle/pten/kernels/cpu/utils.cc
+++ b/paddle/pten/kernels/cpu/copy_kernel.cc
@@ -12,15 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/kernels/cpu/utils.h"
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/pten/kernels/copy_kernel.h"
+
+#include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/memcpy.h"
 
 namespace pten {
 
 // NOTE(chenweihang): blocking is useless in cpu kernel
-void Copy(const CPUContext& dev_ctx,
+template <typename ContextT>
+void Copy(const ContextT& dev_ctx,
           const DenseTensor& src,
           bool blocking,
           DenseTensor* dst) {
@@ -57,4 +63,5 @@ void Copy(const CPUContext& dev_ctx,
 
 }  // namespace pten
 
-PT_REGISTER_NO_TEMPLATE_KERNEL(copy, CPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {}
+PT_REGISTER_GENERAL_KERNEL(
+    copy, CPU, ALL_LAYOUT, pten::Copy<pten::CPUContext>, ALL_DTYPE) {}
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
index b413882c86221d3c5c82a58ff41d5b7f4bf14121..3dba89ea19620b4c8ea41e725e03c512062a0e6c 100644
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -15,7 +15,7 @@
 #include "paddle/pten/kernels/cpu/manipulation.h"
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/infermeta/unary.h"
-#include "paddle/pten/kernels/cpu/utils.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
 #include "paddle/pten/kernels/hybird/math/cast_func.h"
 
diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc
index 025dc8a83294901cc046cd63f1c53635d1167120..0dc593618637f5f2a9feaaf03aab6a11f8390c2c 100644
--- a/paddle/pten/kernels/flatten_kernel.cc
+++ b/paddle/pten/kernels/flatten_kernel.cc
@@ -16,10 +16,8 @@
 #include "paddle/pten/backends/all_context.h"
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/infermeta/unary.h"
-#include "paddle/pten/kernels/cpu/utils.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/funcs/common_shape.h"
-#include "paddle/pten/kernels/gpu/utils.h"
-#include "paddle/pten/kernels/xpu/utils.h"
 
 namespace pten {
 
diff --git a/paddle/pten/kernels/gpu/CMakeLists.txt b/paddle/pten/kernels/gpu/CMakeLists.txt
index 041df126c024c8e20f8794042d3904c281fedf2d..5fdb5359e1fd757311ebc8dbcaa094fd84ff583a 100644
--- a/paddle/pten/kernels/gpu/CMakeLists.txt
+++ b/paddle/pten/kernels/gpu/CMakeLists.txt
@@ -1,11 +1,9 @@
 if(WITH_GPU)
   nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
   nv_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
-  nv_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
-  nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary)
+  nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
 elseif(WITH_ROCM)
   hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
   hip_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
-  hip_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
-  hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary)
+  hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
 endif()
diff --git a/paddle/pten/kernels/gpu/utils.cu b/paddle/pten/kernels/gpu/copy_kernel.cu
similarity index 97%
rename from paddle/pten/kernels/gpu/utils.cu
rename to paddle/pten/kernels/gpu/copy_kernel.cu
index 4d080be11e3ed9164ca58408589b3ace6bcb363b..cb9f8054d3b0f4f13251299c4f0f0c921dba4b06 100644
--- a/paddle/pten/kernels/gpu/utils.cu
+++ b/paddle/pten/kernels/gpu/copy_kernel.cu
@@ -12,15 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/pten/kernels/copy_kernel.h"
+
+#include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/gpu/utils.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/memcpy.h"
 
 namespace pten {
 
-void Copy(const GPUContext& dev_ctx,
+template <typename ContextT>
+void Copy(const ContextT& dev_ctx,
           const DenseTensor& src,
           bool blocking,
           DenseTensor* dst) {
@@ -232,6 +237,8 @@ void Copy(const GPUContext& dev_ctx,
     }
   }
 }
+
 }  // namespace pten
 
-PT_REGISTER_NO_TEMPLATE_KERNEL(copy, GPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {}
+PT_REGISTER_GENERAL_KERNEL(
+    copy, GPU, ALL_LAYOUT, pten::Copy<pten::GPUContext>, ALL_DTYPE) {}
diff --git a/paddle/pten/kernels/gpu/manipulation.cu b/paddle/pten/kernels/gpu/manipulation.cu
index 8c4aa7449a304513fb56131c5f26cad6126fa166..4dbf8e69b450e2ed085f94cc3bfe39dbcd806023 100644
--- a/paddle/pten/kernels/gpu/manipulation.cu
+++ b/paddle/pten/kernels/gpu/manipulation.cu
@@ -14,8 +14,8 @@
 
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/infermeta/unary.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/gpu/manipulation.h"
-#include "paddle/pten/kernels/gpu/utils.h"
 #include "paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
 
@@ -85,6 +85,7 @@ PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
 PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
 #endif
 
-PT_REGISTER_NO_TEMPLATE_KERNEL(reshape, GPU, ANY, pten::Reshape, ALL_DTYPE) {}
 PT_REGISTER_NO_TEMPLATE_KERNEL(
-    reshape_with_xshape, GPU, ANY, pten::ReshapeWithXShape, ALL_DTYPE) {}
+    reshape, GPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
+PT_REGISTER_NO_TEMPLATE_KERNEL(
+    reshape_with_xshape, GPU, ALL_LAYOUT, pten::ReshapeWithXShape, ALL_DTYPE) {}
diff --git a/paddle/pten/kernels/gpu/utils.h b/paddle/pten/kernels/gpu/utils.h
deleted file mode 100644
index 3a455ad70c4dcb2552fa2722a3d9504b64a417a2..0000000000000000000000000000000000000000
--- a/paddle/pten/kernels/gpu/utils.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// CUDA and HIP use same api
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-
-#include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/kernel_registry.h"
-
-namespace pten {
-
-void Copy(const GPUContext& dev_ctx,
-          const DenseTensor& src,
-          bool blocking,
-          DenseTensor* dst);
-
-}  // namespace pten
-
-#endif
diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
index 8c2213ca9b3ce994f975bfd8d98c2b877535260e..a2faf2a5416942900ee8ab6ec895286a06996d50 100644
--- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
+++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
@@ -41,7 +41,7 @@ namespace cub = hipcub;
 
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/gpu/utils.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/hybird/math/cast_func.h"
 
 // Reduce split or not, Whether to use ReduceHigherDim
diff --git a/paddle/pten/kernels/xpu/CMakeLists.txt b/paddle/pten/kernels/xpu/CMakeLists.txt
index 3ba070bdd6c96cbb34abc7de6a84d65f7c6cea9f..c6d66b16512785156d4ca846b537d1e9612a0e2f 100644
--- a/paddle/pten/kernels/xpu/CMakeLists.txt
+++ b/paddle/pten/kernels/xpu/CMakeLists.txt
@@ -1,2 +1 @@
-cc_library(utils_xpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
-cc_library(manipulation_xpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_xpu unary)
+cc_library(manipulation_xpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
diff --git a/paddle/pten/kernels/xpu/utils.cc b/paddle/pten/kernels/xpu/copy_kernel.cc
similarity index 89%
rename from paddle/pten/kernels/xpu/utils.cc
rename to paddle/pten/kernels/xpu/copy_kernel.cc
index 5ea3a359ef6d69b2fafaa9b76418c770ebfc9de7..479ef50836622614ec7bde9888acdda059266b96 100644
--- a/paddle/pten/kernels/xpu/utils.cc
+++ b/paddle/pten/kernels/xpu/copy_kernel.cc
@@ -12,14 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/kernels/xpu/utils.h"
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/pten/kernels/copy_kernel.h"
+
+#include "paddle/pten/backends/xpu/xpu_context.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/memcpy.h"
 
 namespace pten {
 
-void Copy(const XPUDeviceContext& dev_ctx,
+template <typename ContextT>
+void Copy(const ContextT& dev_ctx,
           const DenseTensor& src,
           bool blocking,
           DenseTensor* dst) {
@@ -76,4 +82,5 @@ void Copy(const XPUDeviceContext& dev_ctx,
 
 }  // namespace pten
 
-PT_REGISTER_NO_TEMPLATE_KERNEL(copy, XPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {}
+PT_REGISTER_GENERAL_KERNEL(
+    copy, XPU, ALL_LAYOUT, pten::Copy<pten::XPUContext>, ALL_DTYPE) {}
diff --git a/paddle/pten/kernels/xpu/manipulation.cc b/paddle/pten/kernels/xpu/manipulation.cc
index ecd673015a6770517032218989f9f9947cc2a69f..4d0ed7cb825811f302ee1be399c3f01d9b1f40e2 100644
--- a/paddle/pten/kernels/xpu/manipulation.cc
+++ b/paddle/pten/kernels/xpu/manipulation.cc
@@ -14,8 +14,8 @@
 
 #include "paddle/pten/kernels/xpu/manipulation.h"
 #include "paddle/pten/infermeta/unary.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
-#include "paddle/pten/kernels/xpu/utils.h"
 
 namespace pten {
 
diff --git a/paddle/pten/kernels/xpu/utils.h b/paddle/pten/kernels/xpu/utils.h
deleted file mode 100644
index 6e34502eb23a503a3c63ecdc45a73dfc86765780..0000000000000000000000000000000000000000
--- a/paddle/pten/kernels/xpu/utils.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/kernel_registry.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/device_context.h"
-namespace pten {
-
-using XPUDeviceContext = paddle::platform::XPUDeviceContext;
-
-void Copy(const XPUDeviceContext& dev_ctx,
-          const DenseTensor& src,
-          bool blocking,
-          DenseTensor* dst);
-
-}  // namespace pten
-
-#endif
diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc
index e29fa11d58d1d4816f475ef3e708aa5bcf009586..bef0e2af4cf920ba6f2b90309b13ab6ccc03bdf6 100644
--- a/paddle/pten/tests/api/test_matmul_api.cc
+++ b/paddle/pten/tests/api/test_matmul_api.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/gpu/utils.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 
 namespace paddle {
 namespace tests {
diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc
index 9cc994c569553791dc51a4fea97a267d4b73c5c2..3095c83d97c9801c49d073a873fd8db496d63952 100644
--- a/paddle/pten/tests/kernels/test_copy_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <memory>
 
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/cpu/utils.h"
+#include "paddle/pten/kernels/copy_kernel.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
@@ -28,8 +28,7 @@ namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
 // TODO(YuanRisheng): This TEST file need to be refactored after 'copy' realized
-// in
-// 'paddle/api',
+// in 'paddle/api'
 TEST(DEV_API, copy) {
   // 1. create tensor
   const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(