diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index f89fe234c201ab73d0262396b9f6b65313f868a1..d0d985874fa5857125f9722bb7695dd5e99e5aa4 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -20,7 +20,7 @@
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/parallel_context.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
 #endif
@@ -103,7 +103,7 @@ static void SplitTensorsForAllReduce(
   }
   // Sometimes direct copies will be faster
   if (p_dense_tensors->size() < 10) {
-    operators::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
+    phi::funcs::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
   } else {
     operators::math::SplitFunctor<DeviceContext, T> split_functor_;
     split_functor_(context, *in, shape_refer, 0, &outs);
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index a287298ef020bae98c110261873ac4d489a1308b..4a64c4411df41649259bc1efc0e4250f5fc77785 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -188,7 +188,6 @@ cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
-cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
 if (WITH_GPU)
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index de7d58072382a12eabe7b0740f9bb45841917174..58d978ea9c73a0f069d52ea686aa4b1989f36e17 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -20,10 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/phi/kernels/concat_kernel.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index c193eabba372c39bcbe450f3b826dc4cef17a5cd..0c791f01bd923536b1bbc93151a221393ed6bb26 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace paddle {
 namespace operators {
 // Internal
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index 3f9a55225ca342941a041fa9b9360825c7019b93..38116fc12163ac1ac4e0a5ae054d48daad9276c4 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -24,11 +24,11 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h
index 050752f23888b2c19b94a9d55e16ad9a50fade63..407b57e3a8281404c46ffec2c4271429427d6b8b 100644
--- a/paddle/fluid/operators/partial_concat_op.h
+++ b/paddle/fluid/operators/partial_concat_op.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/operators/utils.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h
index 1c418d5e0373ab0683440654d92bb4b19c7f2e43..205a743605c297f7a67493335ca98c8819748f43 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h
@@ -14,8 +14,8 @@ limitations under the License. */

 #pragma once
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace paddle {
 namespace operators {
@@ -140,12 +140,12 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
           static_cast<size_t>(lod[0][i] + offset_data[i]),
           static_cast<size_t>(lod[0][i] + offset_data[i] + length_data[i]));

-      StridedMemcpy<T>(ctx.device_context(),
-                       in_t.data<T>(),
-                       in_stride,
-                       in_t.dims(),
-                       out_stride,
-                       out->data<T>() + out_offset);
+      phi::funcs::StridedMemcpy<T>(ctx.device_context(),
+                                   in_t.data<T>(),
+                                   in_stride,
+                                   in_t.dims(),
+                                   out_stride,
+                                   out->data<T>() + out_offset);
       out_offset += length_data[i] * in_stride[0];
     }
   }
@@ -201,12 +201,12 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
           static_cast<size_t>(lod[0][i] + offset_data[i]),
           static_cast<size_t>(lod[0][i] + offset_data[i] + length_data[i]));

-      StridedMemcpy<T>(ctx.device_context(),
-                       out_grad_t.data<T>(),
-                       out_grad_stride,
-                       out_grad_t.dims(),
-                       x_grad_stride,
-                       x_grad_t.data<T>());
+      phi::funcs::StridedMemcpy<T>(ctx.device_context(),
+                                   out_grad_t.data<T>(),
+                                   out_grad_stride,
+                                   out_grad_t.dims(),
+                                   x_grad_stride,
+                                   x_grad_t.data<T>());
     }
   }
 }
diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h
index 260d368dd0ba1bc213ddd8bb7248180bd2fbab9f..0b5c3f91ae1611089096387b85bc343be980fde1 100644
--- a/paddle/fluid/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
@@ -18,9 +18,9 @@ limitations under the License. */

 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/phi_utils.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace paddle {
 namespace operators {
@@ -96,12 +96,13 @@ class SppKernel : public framework::OpKernel<T> {
       out_level.Resize(output_flatten_shape);
       // concat
       auto out_level_stride = phi::stride(out_level.dims());
-      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
-                       out_level.data<T>(),
-                       out_level_stride,
-                       out_level.dims(),
-                       out_stride,
-                       out->data<T>() + output_offset);
+      phi::funcs::StridedMemcpy<T>(
+          context.template device_context<DeviceContext>(),
+          out_level.data<T>(),
+          out_level_stride,
+          out_level.dims(),
+          out_stride,
+          out->data<T>() + output_offset);
       output_offset += out_level.dims()[1] * out_level_stride[1];
     }
   }
@@ -150,19 +151,21 @@ class SppGradKernel : public framework::OpKernel<T> {
       outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
       auto flatten_stride = phi::stride(out_level.dims());
       // memcpy
-      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
-                       out->data<T>() + out_offset,
-                       out_stride,
-                       out_level.dims(),
-                       flatten_stride,
-                       out_level.data<T>());
+      phi::funcs::StridedMemcpy<T>(
+          context.template device_context<DeviceContext>(),
+          out->data<T>() + out_offset,
+          out_stride,
+          out_level.dims(),
+          flatten_stride,
+          out_level.data<T>());

-      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
-                       out_grad->data<T>() + out_offset,
-                       out_stride,
-                       outgrad_level.dims(),
-                       flatten_stride,
-                       outgrad_level.data<T>());
+      phi::funcs::StridedMemcpy<T>(
+          context.template device_context<DeviceContext>(),
+          out_grad->data<T>() + out_offset,
+          out_stride,
+          outgrad_level.dims(),
+          flatten_stride,
+          outgrad_level.data<T>());
       out_offset += out_level.dims()[1] * out_stride[1];
       // flatten backward to nchw
diff --git a/paddle/fluid/operators/unbind_op.h b/paddle/fluid/operators/unbind_op.h
index 082e4584616a1c708d88791cec3f79fa2bc3d60a..51347e459298862d7b4b1c18e5a8c29c7064a79c 100644
--- a/paddle/fluid/operators/unbind_op.h
+++ b/paddle/fluid/operators/unbind_op.h
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/operators/utils.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index f0c038226f821e5aef293e0df0373c642e272443..b21d735b589dd1a76c6095c1db9f40f022e323f7 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -28,9 +28,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -724,14 +724,13 @@ void _concatCompute(const std::vector<phi::DenseTensor> &ins,
     for (auto &in : ins) {
       auto in_stride = phi::stride_numel(in.dims());
       auto out_stride = phi::stride_numel(out->dims());
-      paddle::operators::StridedNumelCopyWithAxis<T>(
-          ctx,
-          axis,
-          out->data<T>() + output_offset,
-          out_stride,
-          in.data<T>(),
-          in_stride,
-          in_stride[axis]);
+      phi::funcs::StridedNumelCopyWithAxis<T>(ctx,
+                                              axis,
+                                              out->data<T>() + output_offset,
+                                              out_stride,
+                                              in.data<T>(),
+                                              in_stride,
+                                              in_stride[axis]);
       output_offset += in_stride[axis];
     }
   } else {
diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc
index 1075cb9f777c38573498a1134124ab6dbcd32fec..da5415d9e490763f9c3d07b41c6e34cd7c54cbba 100644
--- a/paddle/phi/kernels/cpu/concat_kernel.cc
+++ b/paddle/phi/kernels/cpu/concat_kernel.cc
@@ -14,7 +14,6 @@

 #include "paddle/phi/kernels/concat_kernel.h"

-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
@@ -24,6 +23,7 @@
 #include "paddle/phi/core/lod_utils.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace phi {

@@ -86,14 +86,13 @@ void ConcatKernel(const Context& dev_ctx,
       }
       auto in_stride = phi::stride_numel(in->dims());
       auto out_stride = phi::stride_numel(out->dims());
-      paddle::operators::StridedNumelCopyWithAxis<T>(
-          dev_ctx,
-          axis,
-          out->data<T>() + output_offset,
-          out_stride,
-          in->data<T>(),
-          in_stride,
-          in_stride[axis]);
+      phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
+                                              axis,
+                                              out->data<T>() + output_offset,
+                                              out_stride,
+                                              in->data<T>(),
+                                              in_stride,
+                                              in_stride[axis]);
       output_offset += in_stride[axis];
     }
   } else {
diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/phi/kernels/funcs/detail/strided_memcpy.h
similarity index 66%
rename from paddle/fluid/operators/detail/strided_memcpy.h
rename to paddle/phi/kernels/funcs/detail/strided_memcpy.h
index 4c729a65f59ed7c67fbca04f0fd26ecf30f1d93b..57d9765985f090781c45506b38402419634f93b9 100644
--- a/paddle/fluid/operators/detail/strided_memcpy.h
+++ b/paddle/phi/kernels/funcs/detail/strided_memcpy.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,11 +14,15 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/device_context.h"

-namespace paddle {
-namespace operators {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#endif
+
+namespace phi {
+namespace funcs {
 namespace detail {

 template <typename T, int Rank>
@@ -26,25 +30,25 @@ struct StridedMemcpyFunctor;

 template <typename T>
 struct StridedMemcpyFunctor<T, 0> {
-  void operator()(const platform::DeviceContext& dev_ctx,
+  void operator()(const phi::DeviceContext& dev_ctx,
                   const T* src,
                   const int64_t* src_stride,
                   const int64_t* dst_dim,
                   const int64_t* dst_stride,
                   T* dst) const {
     auto place = dev_ctx.GetPlace();
-    if (platform::is_cpu_place(place)) {
+    if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
+      paddle::memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
-      memory::Copy(
+      paddle::memory::Copy(
           gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream());
 #else
       PADDLE_THROW(
-          platform::errors::Unavailable("Paddle is not compiled with GPU."));
+          phi::errors::Unavailable("Paddle is not compiled with GPU."));
 #endif
     }
   }
@@ -52,29 +56,30 @@ struct StridedMemcpyFunctor<T, 0> {

 template <typename T>
 struct StridedMemcpyFunctor<T, 1> {
-  void operator()(const platform::DeviceContext& dev_ctx,
+  void operator()(const phi::DeviceContext& dev_ctx,
                   const T* src,
                   const int64_t* src_stride,
                   const int64_t* dst_dim,
                   const int64_t* dst_stride,
                   T* dst) const {
     auto place = dev_ctx.GetPlace();
-    if (platform::is_cpu_place(place)) {
+    if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
+      paddle::memory::Copy(
+          cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
-      memory::Copy(gpu_place,
-                   dst,
-                   gpu_place,
-                   src,
-                   sizeof(T) * dst_dim[0],
-                   cuda_ctx.stream());
+      paddle::memory::Copy(gpu_place,
+                           dst,
+                           gpu_place,
+                           src,
+                           sizeof(T) * dst_dim[0],
+                           cuda_ctx.stream());
 #else
       PADDLE_THROW(
-          platform::errors::Unavailable("Paddle is not compiled with GPU."));
+          phi::errors::Unavailable("Paddle is not compiled with GPU."));
 #endif
     }
   }
@@ -82,7 +87,7 @@ struct StridedMemcpyFunctor<T, 1> {

 template <typename T, int Rank>
 struct StridedMemcpyFunctor<T, Rank> {
-  void operator()(const platform::DeviceContext& dev_ctx,
+  void operator()(const phi::DeviceContext& dev_ctx,
                   const T* src,
                   const int64_t* src_stride,
                   const int64_t* dst_dim,
@@ -99,10 +104,10 @@ struct StridedMemcpyFunctor<T, Rank> {

 template <typename T>
 struct StridedCopyDimVisitor {
-  StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx,
+  StridedCopyDimVisitor(const phi::DeviceContext& dev_ctx,
                         const T* src,
-                        const framework::DDim& src_stride,
-                        const framework::DDim& dst_stride,
+                        const phi::DDim& src_stride,
+                        const phi::DDim& dst_stride,
                         T* dst)
       : dev_ctx_(dev_ctx),
         src_(src),
@@ -111,7 +116,7 @@ struct StridedCopyDimVisitor {
         dst_(dst) {}

   template <int D>
-  void operator()(const framework::Dim<D>& dst_dim) const {
+  void operator()(const phi::Dim<D>& dst_dim) const {
     StridedMemcpyFunctor<T, D> functor;
     functor(dev_ctx_,
             src_,
@@ -121,13 +126,13 @@ struct StridedCopyDimVisitor {
             dst_);
   }

-  const platform::DeviceContext& dev_ctx_;
+  const phi::DeviceContext& dev_ctx_;
   const T* src_;
-  const framework::DDim& src_stride_;
-  const framework::DDim& dst_stride_;
+  const phi::DDim& src_stride_;
+  const phi::DDim& dst_stride_;
   T* dst_;
 };

 }  // namespace detail
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/phi/kernels/funcs/strided_memcpy.h
similarity index 68%
rename from paddle/fluid/operators/strided_memcpy.h
rename to paddle/phi/kernels/funcs/strided_memcpy.h
index 3a562d2f26e85ca75d76535b6cad3bd62755a41c..cac82faf64a21f4bc86a425c44dad5bb03eac8fb 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/phi/kernels/funcs/strided_memcpy.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -12,11 +12,13 @@ limitations under the License. */
 #pragma once
 #include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/detail/strided_memcpy.h"
+#include "paddle/phi/kernels/funcs/detail/strided_memcpy.h"

-namespace paddle {
-namespace operators {
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+namespace funcs {

 // Strided memory copy from src to dst.
 //
@@ -33,13 +35,13 @@ namespace operators {
 // NOTE: When use GPU, the memcpy is async. To sync memcpy, please invoke
 // `dev_ctx.Wait()`.
 template <typename T>
-inline void StridedMemcpy(const platform::DeviceContext& dev_ctx,
+inline void StridedMemcpy(const phi::DeviceContext& dev_ctx,
                           const T* src,
-                          const framework::DDim& src_stride,
-                          const framework::DDim& dst_dim,
-                          const framework::DDim& dst_stride,
+                          const phi::DDim& src_stride,
+                          const phi::DDim& dst_dim,
+                          const phi::DDim& dst_stride,
                           T* dst) {
-  paddle::operators::detail::StridedCopyDimVisitor<T> func(
+  detail::StridedCopyDimVisitor<T> func(
       dev_ctx, src, src_stride, dst_stride, dst);
   dst_dim.apply_visitor(func);
 }
@@ -52,12 +54,12 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx,
 // NOTE: The src and dst tensor should have the same elements
 // except the specified axis.
 template <typename T>
-inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
+inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
                                      int64_t axis,
                                      T* dst,
-                                     const framework::DDim& dst_stride_numel,
+                                     const phi::DDim& dst_stride_numel,
                                      const T* src,
-                                     const framework::DDim& src_stride_numel,
+                                     const phi::DDim& src_stride_numel,
                                      int64_t size) {
   int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
   int64_t src_after = src_stride_numel[axis];
@@ -66,7 +68,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,

   PADDLE_ENFORCE_EQ(src_stride_numel.size(),
                     dst_stride_numel.size(),
-                    platform::errors::InvalidArgument(
+                    phi::errors::InvalidArgument(
                         "Source and destination tensor should have the same "
                         "dimension size, but source tensor dimension size is "
                         "%u, destination tensor size is %u.",
@@ -78,7 +80,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
     PADDLE_ENFORCE_EQ(
         src_stride_numel[i] / src_stride_numel[axis],
         dst_stride_numel[i] / dst_stride_numel[axis],
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "Source and destination tensor should have the same number of "
            "elements except the specified axis, but the source elements "
            "number is %d, destination elements number is %d.",
@@ -90,7 +92,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
     PADDLE_ENFORCE_EQ(
         src_stride_numel[i],
         dst_stride_numel[i],
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "Source and destination tensor should have the same number of "
            "elements except the specified axis, but the source elements "
            "number is %d, destination elements number is %d.",
@@ -100,44 +102,44 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
   }

   for (int64_t i = 0; i < before; ++i) {
-    if (platform::is_cpu_place(place)) {
+    if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      memory::Copy(cpu_place,
-                   dst + i * dst_after,
-                   cpu_place,
-                   src + i * src_after,
-                   sizeof(T) * size);
+      paddle::memory::Copy(cpu_place,
+                           dst + i * dst_after,
+                           cpu_place,
+                           src + i * src_after,
+                           sizeof(T) * size);
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx);
-      memory::Copy(gpu_place,
-                   dst + i * dst_after,
-                   gpu_place,
-                   src + i * src_after,
-                   sizeof(T) * size,
-                   cuda_ctx.stream());
+      paddle::memory::Copy(gpu_place,
+                           dst + i * dst_after,
+                           gpu_place,
+                           src + i * src_after,
+                           sizeof(T) * size,
+                           cuda_ctx.stream());
 #elif defined(PADDLE_WITH_ASCEND_CL)
       auto& npu_place = place;
       auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
-      memory::Copy(npu_place,
-                   dst + i * dst_after,
-                   npu_place,
-                   src + i * src_after,
-                   sizeof(T) * size,
-                   npu_ctx.stream());
+      paddle::memory::Copy(npu_place,
+                           dst + i * dst_after,
+                           npu_place,
+                           src + i * src_after,
+                           sizeof(T) * size,
+                           npu_ctx.stream());
 #elif defined(PADDLE_WITH_MLU)
       auto& mlu_place = place;
       auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
-      memory::Copy(mlu_place,
-                   dst + i * dst_after,
-                   mlu_place,
-                   src + i * src_after,
-                   sizeof(T) * size,
-                   mlu_ctx.stream());
+      paddle::memory::Copy(mlu_place,
+                           dst + i * dst_after,
+                           mlu_place,
+                           src + i * src_after,
+                           sizeof(T) * size,
+                           mlu_ctx.stream());
 #else
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Paddle is not compiled with GPU."));
+      PADDLE_THROW(
+          phi::errors::PreconditionNotMet("Paddle is not compiled with GPU."));
 #endif
     }
   }
@@ -145,11 +147,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,

 template <typename T>
 inline void StridedMemcpyWithAxis0(
-    const platform::DeviceContext& dev_ctx,
+    const phi::DeviceContext& dev_ctx,
     const phi::DenseTensor& input,
     const std::vector<const phi::DenseTensor*>& shape_refer,
    std::vector<phi::DenseTensor*>* outputs) {
-  const framework::DDim in_stride = stride_numel(input.dims());
+  const phi::DDim in_stride = stride_numel(input.dims());
   const int axis = 0;
   size_t input_offset = 0;
@@ -169,5 +171,5 @@ inline void StridedMemcpyWithAxis0(
   }
 }

-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu
index 80ff71b2158241370cde44dc581e7b3f388877fa..ac83cb3f829c133de10efa0055705ade2188ef6c 100644
--- a/paddle/phi/kernels/gpu/concat_kernel.cu
+++ b/paddle/phi/kernels/gpu/concat_kernel.cu
@@ -14,7 +14,6 @@

 #include "paddle/phi/kernels/concat_kernel.h"

-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
@@ -24,6 +23,7 @@
 #include "paddle/phi/core/lod_utils.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace phi {

@@ -85,14 +85,13 @@ void ConcatKernel(const Context& dev_ctx,
       }
       auto in_stride = phi::stride_numel(in->dims());
       auto out_stride = phi::stride_numel(out->dims());
-      paddle::operators::StridedNumelCopyWithAxis<T>(
-          dev_ctx,
-          axis,
-          out->data<T>() + output_offset,
-          out_stride,
-          in->data<T>(),
-          in_stride,
-          in_stride[axis]);
+      phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
+                                              axis,
+                                              out->data<T>() + output_offset,
+                                              out_stride,
+                                              in->data<T>(),
+                                              in_stride,
+                                              in_stride[axis]);
       output_offset += in_stride[axis];
     }
   } else {
diff --git a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
index 6d169354cb4c303b42d989ec92ee11d981439cb4..b0b0e5728d462e08383ac414e17251efce913a60 100644
--- a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
@@ -13,10 +13,10 @@
 // limitations under the License.

 #pragma once
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/kernels/concat_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace phi {

@@ -57,7 +57,7 @@ void ConcatGradKernel(const Context& dev_ctx,
   if (axis == 0 && outs.size() < 10) {
     std::vector<const DenseTensor*> ref_shape;
     ref_shape.insert(ref_shape.begin(), x.begin(), x.end());
-    paddle::operators::StridedMemcpyWithAxis0<T>(
+    phi::funcs::StridedMemcpyWithAxis0<T>(
         dev_ctx, out_grad, ref_shape, &outputs);
   } else {
     phi::funcs::SplitFunctor<Context, T> split_functor;
diff --git a/paddle/phi/kernels/impl/split_kernel_impl.h b/paddle/phi/kernels/impl/split_kernel_impl.h
index 6f43e8ea143f06fe4ed50b041acdedcb959f7001..77acf81cf4c59d53033950102dc376d9543ca25f 100644
--- a/paddle/phi/kernels/impl/split_kernel_impl.h
+++ b/paddle/phi/kernels/impl/split_kernel_impl.h
@@ -15,11 +15,11 @@
 #pragma once
 #include "paddle/phi/kernels/split_kernel.h"

-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 namespace phi {
 template <typename T, typename Context>
@@ -37,8 +37,7 @@ void SplitKernel(const Context& dev_ctx,
   int axis = axis_scalar.to<int>();
   // Sometimes direct copies will be faster, this maybe need deeply analysis.
   if (axis == 0 && outs.size() < 10) {
-    paddle::operators::StridedMemcpyWithAxis0<T>(
-        dev_ctx, x, shape_refer, &outs);
+    phi::funcs::StridedMemcpyWithAxis0<T>(dev_ctx, x, shape_refer, &outs);
   } else {
     phi::funcs::SplitFunctor<Context, T> functor;
     functor(dev_ctx, x, shape_refer, axis, &outs);
diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt
index 646c2b36798df04a67c252d4b9def9574a2f0f26..61be79303b094322c387bc265e774d297c9dfc4c 100644
--- a/paddle/phi/tests/kernels/CMakeLists.txt
+++ b/paddle/phi/tests/kernels/CMakeLists.txt
@@ -95,3 +95,8 @@ cc_test(
   test_cache
   SRCS test_cache.cc
   DEPS gtest cache)
+
+cc_test(
+  strided_memcpy_test
+  SRCS strided_memcpy_test.cc
+  DEPS device_context memory)
diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/phi/tests/kernels/strided_memcpy_test.cc
similarity index 63%
rename from paddle/fluid/operators/strided_memcpy_test.cc
rename to paddle/phi/tests/kernels/strided_memcpy_test.cc
index 3d8902a68acfdbe203fa33850f61e9df0b44ff69..7ffc83bb31bb8c55e484fb177b2e660606a3ba20 100644
--- a/paddle/fluid/operators/strided_memcpy_test.cc
+++ b/paddle/phi/tests/kernels/strided_memcpy_test.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"

 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"

-namespace paddle {
-namespace operators {
+namespace phi {
+namespace tests {

 TEST(StridedMemcpy, CPUCrop) {
   // clang-format off
@@ -29,14 +29,15 @@ TEST(StridedMemcpy, CPUCrop) {
   };
   // clang-format on

-  framework::DDim src_stride({5, 1});
+  phi::DDim src_stride({5, 1});

   int dst[4];
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({2, 1});
+  phi::DDim dst_dim({2, 2});
+  phi::DDim dst_stride({2, 1});

   phi::CPUContext ctx;
-  StridedMemcpy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
+  phi::funcs::StridedMemcpy<int>(
+      ctx, src + 1, src_stride, dst_dim, dst_stride, dst);

   ASSERT_EQ(1, dst[0]);
   ASSERT_EQ(2, dst[1]);
@@ -54,13 +55,15 @@ TEST(StridedMemcpy, CPUConcat) {

   int dst[8];

-  framework::DDim src_stride({2, 1});
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({4, 1});
+  phi::DDim src_stride({2, 1});
+  phi::DDim dst_dim({2, 2});
+  phi::DDim dst_stride({4, 1});

   phi::CPUContext ctx;
-  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst);
-  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
+  phi::funcs::StridedMemcpy<int>(
+      ctx, src, src_stride, dst_dim, dst_stride, dst);
+  phi::funcs::StridedMemcpy<int>(
+      ctx, src, src_stride, dst_dim, dst_stride, dst + 2);

   // clang-format off
   int expect_dst[] = {
@@ -83,8 +86,8 @@ TEST(StridedMemcpy, GPUCrop) {
   };
   // clang-format on

-  platform::CUDAPlace gpu0(0);
-  platform::CPUPlace cpu;
+  phi::GPUPlace gpu0(0);
+  phi::CPUPlace cpu;

   phi::GPUContext ctx(gpu0);
   ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
@@ -92,24 +95,24 @@ TEST(StridedMemcpy, GPUCrop) {
                        .get());
   ctx.PartialInitWithAllocator();

-  auto src_allocation = memory::Alloc(gpu0, sizeof(src));
+  auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));

   int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());

-  framework::DDim src_stride({5, 1});
+  phi::DDim src_stride({5, 1});

   int dst[4];
-  auto dst_allocation = memory::Alloc(gpu0, sizeof(dst));
+  auto dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
   int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());

-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({2, 1});
+  phi::DDim dst_dim({2, 2});
+  phi::DDim dst_stride({2, 1});

-  StridedMemcpy<int>(
+  phi::funcs::StridedMemcpy<int>(
       ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);

-  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
+  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
   ctx.Wait();

   ASSERT_EQ(1, dst[0]);
@@ -126,30 +129,31 @@ TEST(StridedMemcpy, GPUConcat) {
   };
   // clang-format on

-  platform::CUDAPlace gpu0(0);
-  platform::CPUPlace cpu;
+  phi::GPUPlace gpu0(0);
+  phi::CPUPlace cpu;

   phi::GPUContext ctx(gpu0);
   ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(gpu0, ctx.stream())
                        .get());
   ctx.PartialInitWithAllocator();
-  auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
+  auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
   int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());

   int dst[8];
-  auto gpu_dst_allocation = memory::Alloc(gpu0, sizeof(dst));
+  auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
   int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());

-  framework::DDim src_stride({2, 1});
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({4, 1});
+  phi::DDim src_stride({2, 1});
+  phi::DDim dst_dim({2, 2});
+  phi::DDim dst_stride({4, 1});

-  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
-  StridedMemcpy<int>(
+  phi::funcs::StridedMemcpy<int>(
+      ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
+  phi::funcs::StridedMemcpy<int>(
       ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);

-  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
+  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
   ctx.Wait();

   // clang-format off
@@ -164,5 +168,5 @@ TEST(StridedMemcpy, GPUConcat) {
 }
 #endif

-}  // namespace operators
-}  // namespace paddle
+}  // namespace tests
+}  // namespace phi
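
After this patch, callers reach the helper through its new `phi::funcs` namespace and header path. Below is a minimal usage sketch, not part of the patch itself; it mirrors the CPUCrop unit test above, and the `CropExample` wrapper function is hypothetical, introduced only for illustration:

#include "paddle/phi/kernels/funcs/strided_memcpy.h"

void CropExample() {
  // 3x5 row-major source; we copy the 2x2 block whose top-left element
  // is src[0][1] (the values 1, 2, 3, 4).
  int src[3][5] = {{0, 1, 2, 0, 0},
                   {0, 3, 4, 0, 0},
                   {0, 0, 0, 0, 0}};
  int dst[4];

  phi::DDim src_stride({5, 1});  // source rows are 5 elements apart
  phi::DDim dst_dim({2, 2});     // shape of the block being copied
  phi::DDim dst_stride({2, 1});  // destination is densely packed

  phi::CPUContext ctx;
  phi::funcs::StridedMemcpy<int>(
      ctx, &src[0][1], src_stride, dst_dim, dst_stride, dst);
  // dst now holds {1, 2, 3, 4}. Per the header's NOTE, the copy is async
  // on GPU contexts, so a GPU caller must invoke ctx.Wait() before reading.
}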