Unverified commit 17318c1a, authored by Huang Jiyi, committed by GitHub

[PHI decoupling] move strided_memcpy.h to phi (#50346)

* decouple strided_memcpy

* move strided_memcpy

* move strided_memcpy to phi

* fix namespace

* update

* fix gpu compile bugs
Parent commit: 90650534
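For downstream code, the migration this commit performs is mechanical: include the phi header instead of the fluid one and qualify the helpers with `phi::funcs`. A minimal sketch, assuming a hypothetical caller (the wrapper function and its parameters are illustrative, not taken from this diff):

```cpp
// Hypothetical caller, for illustration only: the include path and the
// phi::funcs namespace are what this PR introduces; the wrapper function
// itself is not part of the diff.
#include "paddle/phi/kernels/funcs/strided_memcpy.h"

template <typename T>
void CopyWithStrides(const phi::DeviceContext& dev_ctx,
                     const T* src,
                     const phi::DDim& src_stride,
                     const phi::DDim& dst_dim,
                     const phi::DDim& dst_stride,
                     T* dst) {
  // Before this PR: #include "paddle/fluid/operators/strided_memcpy.h" and
  // paddle::operators::StridedMemcpy<T>(...). The argument list is unchanged.
  phi::funcs::StridedMemcpy<T>(
      dev_ctx, src, src_stride, dst_dim, dst_stride, dst);
}
```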
@@ -20,7 +20,7 @@
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/parallel_context.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
 #endif
@@ -103,7 +103,7 @@ static void SplitTensorsForAllReduce(
   }
   // Sometimes direct copies will be faster
   if (p_dense_tensors->size() < 10) {
-    operators::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
+    phi::funcs::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
   } else {
     operators::math::SplitFunctor<DeviceContext, T> split_functor_;
     split_functor_(context, *in, shape_refer, 0, &outs);
...
@@ -188,7 +188,6 @@ cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
-cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
 if (WITH_GPU)
...
@@ -20,10 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/phi/kernels/concat_kernel.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace paddle {
 namespace operators {
...
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace paddle {
 namespace operators {  // Internal
...
@@ -24,11 +24,11 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace paddle {
 namespace operators {
...
@@ -18,8 +18,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/operators/utils.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace paddle {
 namespace operators {
...
@@ -14,8 +14,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace paddle {
 namespace operators {
@@ -140,12 +140,12 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
           static_cast<int>(lod[0][i] + offset_data[i]),
           static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
-      StridedMemcpy<T>(ctx.device_context(),
+      phi::funcs::StridedMemcpy<T>(ctx.device_context(),
                                    in_t.data<T>(),
                                    in_stride,
                                    in_t.dims(),
                                    out_stride,
                                    out->data<T>() + out_offset);
       out_offset += length_data[i] * in_stride[0];
     }
   }
@@ -201,12 +201,12 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
           static_cast<int>(lod[0][i] + offset_data[i]),
           static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
-      StridedMemcpy<T>(ctx.device_context(),
+      phi::funcs::StridedMemcpy<T>(ctx.device_context(),
                                    out_grad_t.data<T>(),
                                    out_grad_stride,
                                    out_grad_t.dims(),
                                    x_grad_stride,
                                    x_grad_t.data<T>());
     }
   }
 }
...
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/phi_utils.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace paddle {
 namespace operators {
@@ -96,12 +96,13 @@ class SppKernel : public framework::OpKernel<T> {
       out_level.Resize(output_flatten_shape);
       // concat
       auto out_level_stride = phi::stride(out_level.dims());
-      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
-                       out_level.data<T>(),
-                       out_level_stride,
-                       out_level.dims(),
-                       out_stride,
-                       out->data<T>() + output_offset);
+      phi::funcs::StridedMemcpy<T>(
+          context.template device_context<DeviceContext>(),
+          out_level.data<T>(),
+          out_level_stride,
+          out_level.dims(),
+          out_stride,
+          out->data<T>() + output_offset);
       output_offset += out_level.dims()[1] * out_level_stride[1];
     }
   }
@@ -150,19 +151,21 @@ class SppGradKernel : public framework::OpKernel<T> {
       outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
       auto flatten_stride = phi::stride(out_level.dims());
       // memcpy
-      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
-                       out->data<T>() + out_offset,
-                       out_stride,
-                       out_level.dims(),
-                       flatten_stride,
-                       out_level.data<T>());
-      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
-                       out_grad->data<T>() + out_offset,
-                       out_stride,
-                       outgrad_level.dims(),
-                       flatten_stride,
-                       outgrad_level.data<T>());
+      phi::funcs::StridedMemcpy<T>(
+          context.template device_context<DeviceContext>(),
+          out->data<T>() + out_offset,
+          out_stride,
+          out_level.dims(),
+          flatten_stride,
+          out_level.data<T>());
+      phi::funcs::StridedMemcpy<T>(
+          context.template device_context<DeviceContext>(),
+          out_grad->data<T>() + out_offset,
+          out_stride,
+          outgrad_level.dims(),
+          flatten_stride,
+          outgrad_level.data<T>());
       out_offset += out_level.dims()[1] * out_stride[1];
       // flatten backward to nchw
...
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/operators/utils.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace paddle {
 namespace operators {
...
@@ -28,9 +28,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -724,14 +724,13 @@ void _concatCompute(const std::vector<phi::DenseTensor> &ins,
     for (auto &in : ins) {
       auto in_stride = phi::stride_numel(in.dims());
       auto out_stride = phi::stride_numel(out->dims());
-      paddle::operators::StridedNumelCopyWithAxis<T>(
-          ctx,
-          axis,
-          out->data<T>() + output_offset,
-          out_stride,
-          in.data<T>(),
-          in_stride,
-          in_stride[axis]);
+      phi::funcs::StridedNumelCopyWithAxis<T>(ctx,
+                                              axis,
+                                              out->data<T>() + output_offset,
+                                              out_stride,
+                                              in.data<T>(),
+                                              in_stride,
+                                              in_stride[axis]);
       output_offset += in_stride[axis];
     }
   } else {
...
@@ -14,7 +14,6 @@
 #include "paddle/phi/kernels/concat_kernel.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
@@ -24,6 +23,7 @@
 #include "paddle/phi/core/lod_utils.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace phi {
@@ -86,14 +86,13 @@ void ConcatKernel(const Context& dev_ctx,
       }
       auto in_stride = phi::stride_numel(in->dims());
       auto out_stride = phi::stride_numel(out->dims());
-      paddle::operators::StridedNumelCopyWithAxis<T>(
-          dev_ctx,
-          axis,
-          out->data<T>() + output_offset,
-          out_stride,
-          in->data<T>(),
-          in_stride,
-          in_stride[axis]);
+      phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
+                                              axis,
+                                              out->data<T>() + output_offset,
+                                              out_stride,
+                                              in->data<T>(),
+                                              in_stride,
+                                              in_stride[axis]);
       output_offset += in_stride[axis];
     }
   } else {
...
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,11 +14,15 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/device_context.h"
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#endif
-namespace paddle {
-namespace operators {
+namespace phi {
+namespace funcs {
 namespace detail {
 template <typename T, int Rank>
@@ -26,25 +30,25 @@ struct StridedMemcpyFunctor;
 template <typename T>
 struct StridedMemcpyFunctor<T, 0> {
-  void operator()(const platform::DeviceContext& dev_ctx,
+  void operator()(const phi::DeviceContext& dev_ctx,
                   const T* src,
                   const int64_t* src_stride,
                   const int64_t* dst_dim,
                   const int64_t* dst_stride,
                   T* dst) const {
     auto place = dev_ctx.GetPlace();
-    if (platform::is_cpu_place(place)) {
+    if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
+      paddle::memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
-      memory::Copy(
+      paddle::memory::Copy(
           gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream());
 #else
       PADDLE_THROW(
-          platform::errors::Unavailable("Paddle is not compiled with GPU."));
+          phi::errors::Unavailable("Paddle is not compiled with GPU."));
 #endif
     }
   }
@@ -52,29 +56,30 @@ struct StridedMemcpyFunctor<T, 0> {
 template <typename T>
 struct StridedMemcpyFunctor<T, 1> {
-  void operator()(const platform::DeviceContext& dev_ctx,
+  void operator()(const phi::DeviceContext& dev_ctx,
                   const T* src,
                   const int64_t* src_stride,
                   const int64_t* dst_dim,
                   const int64_t* dst_stride,
                   T* dst) const {
     auto place = dev_ctx.GetPlace();
-    if (platform::is_cpu_place(place)) {
+    if (place.GetType() == phi::AllocationType::CPU) {
      auto& cpu_place = place;
-      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
+      paddle::memory::Copy(
+          cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
-      memory::Copy(gpu_place,
-                   dst,
-                   gpu_place,
-                   src,
-                   sizeof(T) * dst_dim[0],
-                   cuda_ctx.stream());
+      paddle::memory::Copy(gpu_place,
+                           dst,
+                           gpu_place,
+                           src,
+                           sizeof(T) * dst_dim[0],
+                           cuda_ctx.stream());
 #else
       PADDLE_THROW(
-          platform::errors::Unavailable("Paddle is not compiled with GPU."));
+          phi::errors::Unavailable("Paddle is not compiled with GPU."));
 #endif
     }
   }
@@ -82,7 +87,7 @@ struct StridedMemcpyFunctor<T, 1> {
 template <typename T, int Rank>
 struct StridedMemcpyFunctor {
-  void operator()(const platform::DeviceContext& dev_ctx,
+  void operator()(const phi::DeviceContext& dev_ctx,
                   const T* src,
                   const int64_t* src_stride,
                   const int64_t* dst_dim,
@@ -99,10 +104,10 @@ struct StridedMemcpyFunctor {
 template <typename T>
 struct StridedCopyDimVisitor {
-  StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx,
+  StridedCopyDimVisitor(const phi::DeviceContext& dev_ctx,
                         const T* src,
-                        const framework::DDim& src_stride,
-                        const framework::DDim& dst_stride,
+                        const phi::DDim& src_stride,
+                        const phi::DDim& dst_stride,
                         T* dst)
       : dev_ctx_(dev_ctx),
         src_(src),
@@ -111,7 +116,7 @@ struct StridedCopyDimVisitor {
         dst_(dst) {}
   template <int D>
-  void operator()(const framework::Dim<D>& dst_dim) const {
+  void operator()(const phi::Dim<D>& dst_dim) const {
     StridedMemcpyFunctor<T, D> functor;
     functor(dev_ctx_,
             src_,
@@ -121,13 +126,13 @@
             dst_);
   }
-  const platform::DeviceContext& dev_ctx_;
+  const phi::DeviceContext& dev_ctx_;
   const T* src_;
-  const framework::DDim& src_stride_;
-  const framework::DDim& dst_stride_;
+  const phi::DDim& src_stride_;
+  const phi::DDim& dst_stride_;
   T* dst_;
 };
 }  // namespace detail
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -12,11 +12,13 @@ limitations under the License. */
 #pragma once
 #include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/detail/strided_memcpy.h"
+#include "paddle/phi/kernels/funcs/detail/strided_memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/dense_tensor.h"
-namespace paddle {
-namespace operators {
+namespace phi {
+namespace funcs {
 // Strided memory copy from src to dst.
 //
@@ -33,13 +35,13 @@ namespace operators {
 // NOTE: When use GPU, the memcpy is async. To sync memcpy, please invoke
 // `dev_ctx.Wait()`.
 template <typename T>
-inline void StridedMemcpy(const platform::DeviceContext& dev_ctx,
+inline void StridedMemcpy(const phi::DeviceContext& dev_ctx,
                           const T* src,
-                          const framework::DDim& src_stride,
-                          const framework::DDim& dst_dim,
-                          const framework::DDim& dst_stride,
+                          const phi::DDim& src_stride,
+                          const phi::DDim& dst_dim,
+                          const phi::DDim& dst_stride,
                           T* dst) {
-  paddle::operators::detail::StridedCopyDimVisitor<T> func(
+  detail::StridedCopyDimVisitor<T> func(
       dev_ctx, src, src_stride, dst_stride, dst);
   dst_dim.apply_visitor(func);
 }
@@ -52,12 +54,12 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx,
 // NOTE: The src and dst tensor should have the same elements
 // except the specified axis.
 template <typename T>
-inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
+inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
                                      int64_t axis,
                                      T* dst,
-                                     const framework::DDim& dst_stride_numel,
+                                     const phi::DDim& dst_stride_numel,
                                      const T* src,
-                                     const framework::DDim& src_stride_numel,
+                                     const phi::DDim& src_stride_numel,
                                      int64_t size) {
   int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
   int64_t src_after = src_stride_numel[axis];
@@ -66,7 +68,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
   PADDLE_ENFORCE_EQ(src_stride_numel.size(),
                     dst_stride_numel.size(),
-                    platform::errors::InvalidArgument(
+                    phi::errors::InvalidArgument(
                         "Source and destination tensor should have the same "
                         "dimension size, but source tensor dimension size is "
                         "%u, destination tensor size is %u.",
@@ -78,7 +80,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
       PADDLE_ENFORCE_EQ(
           src_stride_numel[i] / src_stride_numel[axis],
          dst_stride_numel[i] / dst_stride_numel[axis],
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
              "Source and destination tensor should have the same number of "
              "elements except the specified axis, but the source elements "
              "number is %d, destination elements number is %d.",
@@ -90,7 +92,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
       PADDLE_ENFORCE_EQ(
          src_stride_numel[i],
          dst_stride_numel[i],
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
              "Source and destination tensor should have the same number of "
              "elements except the specified axis, but the source elements "
              "number is %d, destination elements number is %d.",
@@ -100,44 +102,44 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
   }
   for (int64_t i = 0; i < before; ++i) {
-    if (platform::is_cpu_place(place)) {
+    if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      memory::Copy(cpu_place,
-                   dst + i * dst_after,
-                   cpu_place,
-                   src + i * src_after,
-                   sizeof(T) * size);
+      paddle::memory::Copy(cpu_place,
+                           dst + i * dst_after,
+                           cpu_place,
+                           src + i * src_after,
+                           sizeof(T) * size);
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx);
-      memory::Copy(gpu_place,
-                   dst + i * dst_after,
-                   gpu_place,
-                   src + i * src_after,
-                   sizeof(T) * size,
-                   cuda_ctx.stream());
+      paddle::memory::Copy(gpu_place,
+                           dst + i * dst_after,
+                           gpu_place,
+                           src + i * src_after,
+                           sizeof(T) * size,
+                           cuda_ctx.stream());
 #elif defined(PADDLE_WITH_ASCEND_CL)
       auto& npu_place = place;
       auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
-      memory::Copy(npu_place,
-                   dst + i * dst_after,
-                   npu_place,
-                   src + i * src_after,
-                   sizeof(T) * size,
-                   npu_ctx.stream());
+      paddle::memory::Copy(npu_place,
+                           dst + i * dst_after,
+                           npu_place,
+                           src + i * src_after,
+                           sizeof(T) * size,
+                           npu_ctx.stream());
 #elif defined(PADDLE_WITH_MLU)
       auto& mlu_place = place;
       auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
-      memory::Copy(mlu_place,
-                   dst + i * dst_after,
-                   mlu_place,
-                   src + i * src_after,
-                   sizeof(T) * size,
-                   mlu_ctx.stream());
+      paddle::memory::Copy(mlu_place,
+                           dst + i * dst_after,
+                           mlu_place,
+                           src + i * src_after,
+                           sizeof(T) * size,
+                           mlu_ctx.stream());
 #else
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Paddle is not compiled with GPU."));
+      PADDLE_THROW(
+          phi::errors::PreconditionNotMet("Paddle is not compiled with GPU."));
 #endif
     }
   }
@@ -145,11 +147,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
 template <typename T>
 inline void StridedMemcpyWithAxis0(
-    const platform::DeviceContext& dev_ctx,
+    const phi::DeviceContext& dev_ctx,
     const phi::DenseTensor& input,
     const std::vector<const phi::DenseTensor*>& shape_refer,
     std::vector<phi::DenseTensor*>* outputs) {
-  const framework::DDim in_stride = stride_numel(input.dims());
+  const phi::DDim in_stride = stride_numel(input.dims());
   const int axis = 0;
   size_t input_offset = 0;
@@ -169,5 +171,5 @@ inline void StridedMemcpyWithAxis0(
   }
 }
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
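For context on the ConcatKernel and SplitKernel hunks above and below, a hedged sketch of how `StridedNumelCopyWithAxis` is typically invoked for an axis-1 concatenation; the helper function, dimensions, and offsets here are illustrative assumptions, not code from this PR:

```cpp
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"

// Illustration only: copy one {2, 3} source block into a {2, 6} destination
// along axis 1. stride_numel({2, 6}) == {12, 6} and stride_numel({2, 3}) ==
// {6, 3}, so the helper copies size (= 3) elements for each of
// before (= 12 / 6 = 2) leading rows, advancing src by 3 and dst by 6
// elements per row.
template <typename T>
void ConcatOneBlockAlongAxis1(const phi::DeviceContext& dev_ctx,
                              const T* in_data,
                              const phi::DDim& in_dims,   // e.g. {2, 3}
                              T* out_data,
                              const phi::DDim& out_dims,  // e.g. {2, 6}
                              int64_t column_offset) {    // 0 for the first block
  auto in_stride = phi::stride_numel(in_dims);
  auto out_stride = phi::stride_numel(out_dims);
  phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
                                          /*axis=*/1,
                                          out_data + column_offset,
                                          out_stride,
                                          in_data,
                                          in_stride,
                                          /*size=*/in_stride[1]);
}
```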
@@ -14,7 +14,6 @@
 #include "paddle/phi/kernels/concat_kernel.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
@@ -24,6 +23,7 @@
 #include "paddle/phi/core/lod_utils.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace phi {
@@ -85,14 +85,13 @@ void ConcatKernel(const Context& dev_ctx,
       }
       auto in_stride = phi::stride_numel(in->dims());
       auto out_stride = phi::stride_numel(out->dims());
-      paddle::operators::StridedNumelCopyWithAxis<T>(
-          dev_ctx,
-          axis,
-          out->data<T>() + output_offset,
-          out_stride,
-          in->data<T>(),
-          in_stride,
-          in_stride[axis]);
+      phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
+                                              axis,
+                                              out->data<T>() + output_offset,
+                                              out_stride,
+                                              in->data<T>(),
+                                              in_stride,
+                                              in_stride[axis]);
       output_offset += in_stride[axis];
     }
   } else {
...
@@ -13,10 +13,10 @@
 // limitations under the License.
 #pragma once
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/kernels/concat_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace phi {
@@ -57,7 +57,7 @@ void ConcatGradKernel(const Context& dev_ctx,
   if (axis == 0 && outs.size() < 10) {
     std::vector<const DenseTensor*> ref_shape;
     ref_shape.insert(ref_shape.begin(), x.begin(), x.end());
-    paddle::operators::StridedMemcpyWithAxis0<T>(
+    phi::funcs::StridedMemcpyWithAxis0<T>(
         dev_ctx, out_grad, ref_shape, &outputs);
   } else {
     phi::funcs::SplitFunctor<Context, T> split_functor;
...
@@ -15,11 +15,11 @@
 #pragma once
 #include "paddle/phi/kernels/split_kernel.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 namespace phi {
 template <typename T, typename Context>
@@ -37,8 +37,7 @@ void SplitKernel(const Context& dev_ctx,
   int axis = axis_scalar.to<int>();
   // Sometimes direct copies will be faster, this maybe need deeply analysis.
   if (axis == 0 && outs.size() < 10) {
-    paddle::operators::StridedMemcpyWithAxis0<T>(
-        dev_ctx, x, shape_refer, &outs);
+    phi::funcs::StridedMemcpyWithAxis0<T>(dev_ctx, x, shape_refer, &outs);
   } else {
     phi::funcs::SplitFunctor<Context, T> functor;
     functor(dev_ctx, x, shape_refer, axis, &outs);
...
@@ -95,3 +95,8 @@ cc_test(
   test_cache
   SRCS test_cache.cc
   DEPS gtest cache)
+cc_test(
+  strided_memcpy_test
+  SRCS strided_memcpy_test.cc
+  DEPS device_context memory)
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/phi/kernels/funcs/strided_memcpy.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
-namespace paddle {
-namespace operators {
+namespace phi {
+namespace tests {
 TEST(StridedMemcpy, CPUCrop) {
   // clang-format off
@@ -29,14 +29,15 @@ TEST(StridedMemcpy, CPUCrop) {
   };
   // clang-format on
-  framework::DDim src_stride({5, 1});
+  phi::DDim src_stride({5, 1});
   int dst[4];
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({2, 1});
+  phi::DDim dst_dim({2, 2});
+  phi::DDim dst_stride({2, 1});
   phi::CPUContext ctx;
-  StridedMemcpy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
+  phi::funcs::StridedMemcpy<int>(
+      ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
   ASSERT_EQ(1, dst[0]);
   ASSERT_EQ(2, dst[1]);
@@ -54,13 +55,15 @@ TEST(StridedMemcpy, CPUConcat) {
   int dst[8];
-  framework::DDim src_stride({2, 1});
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({4, 1});
+  phi::DDim src_stride({2, 1});
+  phi::DDim dst_dim({2, 2});
+  phi::DDim dst_stride({4, 1});
   phi::CPUContext ctx;
-  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst);
-  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
+  phi::funcs::StridedMemcpy<int>(
+      ctx, src, src_stride, dst_dim, dst_stride, dst);
+  phi::funcs::StridedMemcpy<int>(
+      ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
   // clang-format off
   int expect_dst[] = {
@@ -83,8 +86,8 @@ TEST(StridedMemcpy, GPUCrop) {
   };
   // clang-format on
-  platform::CUDAPlace gpu0(0);
-  platform::CPUPlace cpu;
+  phi::GPUPlace gpu0(0);
+  phi::CPUPlace cpu;
   phi::GPUContext ctx(gpu0);
   ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
@@ -92,24 +95,24 @@ TEST(StridedMemcpy, GPUCrop) {
                        .get());
   ctx.PartialInitWithAllocator();
-  auto src_allocation = memory::Alloc(gpu0, sizeof(src));
+  auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
   int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
-  framework::DDim src_stride({5, 1});
+  phi::DDim src_stride({5, 1});
   int dst[4];
-  auto dst_allocation = memory::Alloc(gpu0, sizeof(dst));
+  auto dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
   int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({2, 1});
+  phi::DDim dst_dim({2, 2});
+  phi::DDim dst_stride({2, 1});
-  StridedMemcpy<int>(
+  phi::funcs::StridedMemcpy<int>(
       ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
-  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
+  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
   ctx.Wait();
   ASSERT_EQ(1, dst[0]);
@@ -126,30 +129,31 @@ TEST(StridedMemcpy, GPUConcat) {
   };
   // clang-format on
-  platform::CUDAPlace gpu0(0);
-  platform::CPUPlace cpu;
+  phi::GPUPlace gpu0(0);
+  phi::CPUPlace cpu;
   phi::GPUContext ctx(gpu0);
   ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(gpu0, ctx.stream())
                        .get());
   ctx.PartialInitWithAllocator();
-  auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
+  auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
   int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+  paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
   int dst[8];
-  auto gpu_dst_allocation = memory::Alloc(gpu0, sizeof(dst));
+  auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
   int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());
-  framework::DDim src_stride({2, 1});
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({4, 1});
+  phi::DDim src_stride({2, 1});
+  phi::DDim dst_dim({2, 2});
+  phi::DDim dst_stride({4, 1});
-  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
-  StridedMemcpy<int>(
-      ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
+  phi::funcs::StridedMemcpy<int>(
+      ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
+  phi::funcs::StridedMemcpy<int>(
+      ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
-  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
+  paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
   ctx.Wait();
   // clang-format off
@@ -164,5 +168,5 @@ TEST(StridedMemcpy, GPUConcat) {
 }
 #endif
-}  // namespace operators
-}  // namespace paddle
+}  // namespace tests
+}  // namespace phi