diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index d0d985874fa5857125f9722bb7695dd5e99e5aa4..f9e60b915176d5b9f5705b898369f1966c1c6504 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -103,7 +103,8 @@ static void SplitTensorsForAllReduce(
   }
   // Sometimes direct copies will be faster
   if (p_dense_tensors->size() < 10) {
-    phi::funcs::StridedMemcpyWithAxis0<T>(context, *in, shape_refer, &outs);
+    phi::funcs::StridedMemcpyWithAxis0<T, DeviceContext>(
+        context, *in, shape_refer, &outs);
   } else {
     operators::math::SplitFunctor<DeviceContext, T> split_functor_;
     split_functor_(context, *in, shape_refer, 0, &outs);
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index f930fc32a475d8f668b4f8860b62e3cbc8415d90..607aa5a894cc3e8fa25e4d04c1e912d031dc1dd6 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -727,13 +727,14 @@ void _concatCompute(const std::vector &ins,
     for (auto &in : ins) {
       auto in_stride = phi::stride_numel(in.dims());
       auto out_stride = phi::stride_numel(out->dims());
-      phi::funcs::StridedNumelCopyWithAxis<T>(ctx,
-                                              axis,
-                                              out->data<T>() + output_offset,
-                                              out_stride,
-                                              in.data<T>(),
-                                              in_stride,
-                                              in_stride[axis]);
+      phi::funcs::StridedNumelCopyWithAxis<T, phi::CPUContext>(
+          ctx,
+          axis,
+          out->data<T>() + output_offset,
+          out_stride,
+          in.data<T>(),
+          in_stride,
+          in_stride[axis]);
       output_offset += in_stride[axis];
     }
   } else {
diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc
index da5415d9e490763f9c3d07b41c6e34cd7c54cbba..58cb1d95666126a104a25d8076343cb60dc4ba0d 100644
--- a/paddle/phi/kernels/cpu/concat_kernel.cc
+++ b/paddle/phi/kernels/cpu/concat_kernel.cc
@@ -86,13 +86,14 @@ void ConcatKernel(const Context& dev_ctx,
       }
       auto in_stride = phi::stride_numel(in->dims());
       auto out_stride = phi::stride_numel(out->dims());
-      phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
-                                              axis,
-                                              out->data<T>() + output_offset,
-                                              out_stride,
-                                              in->data<T>(),
-                                              in_stride,
-                                              in_stride[axis]);
+      phi::funcs::StridedNumelCopyWithAxis<T, Context>(
+          dev_ctx,
+          axis,
+          out->data<T>() + output_offset,
+          out_stride,
+          in->data<T>(),
+          in_stride,
+          in_stride[axis]);
       output_offset += in_stride[axis];
     }
   } else {
diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc
index e8bd17efc7d24f175e297a444c1ecd9a5fb16b6c..d5e727c39fedf301414b7997ec9a9dd0b03109c2 100644
--- a/paddle/phi/kernels/funcs/math_function.cc
+++ b/paddle/phi/kernels/funcs/math_function.cc
@@ -27,7 +27,6 @@ limitations under the License. */
 #include
 #include
 
-#include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/data_type.h"
diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu
index 801076f46c62d867bd9b0b07d05b8de70f361142..24d9248c3712bd7116b0fcce2d14781740f2740c 100644
--- a/paddle/phi/kernels/funcs/math_function.cu
+++ b/paddle/phi/kernels/funcs/math_function.cu
@@ -14,7 +14,6 @@ limitations under the License. */
 #include
 #include
 
-#include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/data_type.h"
diff --git a/paddle/phi/kernels/funcs/strided_memcpy.h b/paddle/phi/kernels/funcs/strided_memcpy.h
index 76252398b14ed04d1bc0e0ead21a5ba277822fae..547b585312955612ff9b48bb52aed3a658a48bd2 100644
--- a/paddle/phi/kernels/funcs/strided_memcpy.h
+++ b/paddle/phi/kernels/funcs/strided_memcpy.h
@@ -14,9 +14,12 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/funcs/detail/strided_memcpy.h"
 
-#include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 
+namespace phi {
+class CPUContext;
+}  // namespace phi
+
 namespace phi {
 namespace funcs {
 
@@ -46,6 +49,32 @@ inline void StridedMemcpy(const phi::DeviceContext& dev_ctx,
   dst_dim.apply_visitor(func);
 }
 
+template <typename Context>
+inline void CopyWithContext(const Context& ctx,
+                            const Place& dst_place,
+                            void* dst,
+                            const Place& src_place,
+                            const void* src,
+                            size_t num) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
+  memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream());
+#else
+  PADDLE_THROW(
+      phi::errors::PreconditionNotMet("Paddle is not compiled with GPU."));
+#endif
+}
+
+template <>
+inline void CopyWithContext(const phi::CPUContext& ctx,
+                            const Place& dst_place,
+                            void* dst,
+                            const Place& src_place,
+                            const void* src,
+                            size_t num) {
+  memory_utils::Copy(dst_place, dst, src_place, src, num);
+}
+
 // Strided numel memory copy from src to dst by the specified axis
 //
 // For example, for a tensor dims [4, 20, 100], the strieded numel is
@@ -53,8 +82,8 @@ inline void StridedMemcpy(const phi::DeviceContext& dev_ctx,
 //
 // NOTE: The src and dst tensor should have the same elements
 // except the specified axis.
-template <typename T>
-inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
+template <typename T, typename Context>
+inline void StridedNumelCopyWithAxis(const Context& ctx,
                                      int64_t axis,
                                      T* dst,
                                      const phi::DDim& dst_stride_numel,
@@ -102,52 +131,18 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
   }
 
   for (int64_t i = 0; i < before; ++i) {
-    if (place.GetType() == phi::AllocationType::CPU) {
-      auto& cpu_place = place;
-      memory_utils::Copy(cpu_place,
-                         dst + i * dst_after,
-                         cpu_place,
-                         src + i * src_after,
-                         sizeof(T) * size);
-    } else {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      auto& gpu_place = place;
-      auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx);
-      memory_utils::Copy(gpu_place,
-                         dst + i * dst_after,
-                         gpu_place,
-                         src + i * src_after,
-                         sizeof(T) * size,
-                         cuda_ctx.stream());
-#elif defined(PADDLE_WITH_ASCEND_CL)
-      auto& npu_place = place;
-      auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
-      memory_utils::Copy(npu_place,
-                         dst + i * dst_after,
-                         npu_place,
-                         src + i * src_after,
-                         sizeof(T) * size,
-                         npu_ctx.stream());
-#elif defined(PADDLE_WITH_MLU)
-      auto& mlu_place = place;
-      auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
-      memory_utils::Copy(mlu_place,
-                         dst + i * dst_after,
-                         mlu_place,
-                         src + i * src_after,
-                         sizeof(T) * size,
-                         mlu_ctx.stream());
-#else
-      PADDLE_THROW(
-          phi::errors::PreconditionNotMet("Paddle is not compiled with GPU."));
-#endif
-    }
+    CopyWithContext(ctx,
+                    place,
+                    dst + i * dst_after,
+                    place,
+                    src + i * src_after,
+                    sizeof(T) * size);
   }
 }
 
-template <typename T>
+template <typename T, typename Context>
 inline void StridedMemcpyWithAxis0(
-    const phi::DeviceContext& dev_ctx,
+    const Context& dev_ctx,
     const phi::DenseTensor& input,
     const std::vector<const phi::DenseTensor*>& shape_refer,
     std::vector<phi::DenseTensor*>* outputs) {
@@ -159,13 +154,13 @@ inline void StridedMemcpyWithAxis0(
     auto out_stride = stride_numel(shape_refer[i]->dims());
     auto out = outputs->at(i);
     if (out != nullptr && out->initialized() && out->numel() > 0) {
-      StridedNumelCopyWithAxis<T>(dev_ctx,
-                                  axis,
-                                  out->data<T>(),
-                                  out_stride,
-                                  input.data<T>() + input_offset,
-                                  in_stride,
-                                  out_stride[axis]);
+      StridedNumelCopyWithAxis<T, Context>(dev_ctx,
+                                           axis,
+                                           out->data<T>(),
+                                           out_stride,
+                                           input.data<T>() + input_offset,
+                                           in_stride,
+                                           out_stride[axis]);
     }
     input_offset += out_stride[axis];
   }
diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu
index ac83cb3f829c133de10efa0055705ade2188ef6c..47e5a220e66f69f016300cfd3b3b0873b53ff506 100644
--- a/paddle/phi/kernels/gpu/concat_kernel.cu
+++ b/paddle/phi/kernels/gpu/concat_kernel.cu
@@ -85,13 +85,14 @@ void ConcatKernel(const Context& dev_ctx,
       }
       auto in_stride = phi::stride_numel(in->dims());
       auto out_stride = phi::stride_numel(out->dims());
-      phi::funcs::StridedNumelCopyWithAxis<T>(dev_ctx,
-                                              axis,
-                                              out->data<T>() + output_offset,
-                                              out_stride,
-                                              in->data<T>(),
-                                              in_stride,
-                                              in_stride[axis]);
+      phi::funcs::StridedNumelCopyWithAxis<T, Context>(
+          dev_ctx,
+          axis,
+          out->data<T>() + output_offset,
+          out_stride,
+          in->data<T>(),
+          in_stride,
+          in_stride[axis]);
       output_offset += in_stride[axis];
     }
   } else {
diff --git a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
index b0b0e5728d462e08383ac414e17251efce913a60..a1f0a98463f7a83f9cd6d3dc834e3ed753b32e46 100644
--- a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
@@ -57,7 +57,7 @@ void ConcatGradKernel(const Context& dev_ctx,
   if (axis == 0 && outs.size() < 10) {
     std::vector<const DenseTensor*> ref_shape;
     ref_shape.insert(ref_shape.begin(), x.begin(), x.end());
-    phi::funcs::StridedMemcpyWithAxis0<T>(
+    phi::funcs::StridedMemcpyWithAxis0<T, Context>(
         dev_ctx, out_grad, ref_shape, &outputs);
   } else {
     phi::funcs::SplitFunctor<Context, T> split_functor;
diff --git a/paddle/phi/kernels/impl/split_kernel_impl.h b/paddle/phi/kernels/impl/split_kernel_impl.h
index 77acf81cf4c59d53033950102dc376d9543ca25f..83968d913feb4a123394b681b710eaf02d956f94 100644
--- a/paddle/phi/kernels/impl/split_kernel_impl.h
+++ b/paddle/phi/kernels/impl/split_kernel_impl.h
@@ -37,7 +37,8 @@ void SplitKernel(const Context& dev_ctx,
   int axis = axis_scalar.to<int>();
   // Sometimes direct copies will be faster, this maybe need deeply analysis.
   if (axis == 0 && outs.size() < 10) {
-    phi::funcs::StridedMemcpyWithAxis0<T>(dev_ctx, x, shape_refer, &outs);
+    phi::funcs::StridedMemcpyWithAxis0<T, Context>(
+        dev_ctx, x, shape_refer, &outs);
   } else {
     phi::funcs::SplitFunctor<Context, T> functor;
     functor(dev_ctx, x, shape_refer, axis, &outs);
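Note (not part of the patch): the core of the change in strided_memcpy.h is replacing the runtime `reinterpret_cast` chain over a type-erased `phi::DeviceContext` with a `CopyWithContext` helper that is templated on the context type and explicitly specialized for `phi::CPUContext`, which lets the header drop `paddle/fluid/platform/device_context.h`. The sketch below is a minimal standalone illustration of that compile-time dispatch pattern only; `ToyCPUContext`, `ToyStreamContext`, and the plain `memcpy` bodies are invented stand-ins for illustration, not Paddle APIs.

```cpp
#include <cstddef>
#include <cstring>
#include <iostream>

// Stand-in context types; only the stream-style context exposes stream().
struct ToyCPUContext {};
struct ToyStreamContext {
  void* stream() const { return nullptr; }  // placeholder for a device stream handle
};

// Primary template: assumes a context that carries a stream, mirroring the
// GPU/NPU/MLU branch of CopyWithContext in the patch.
template <typename Context>
void CopyWithContext(const Context& ctx, void* dst, const void* src,
                     std::size_t num) {
  void* stream = ctx.stream();  // available without any reinterpret_cast
  (void)stream;                 // a real backend would enqueue the copy here
  std::memcpy(dst, src, num);
  std::cout << "stream copy of " << num << " bytes\n";
}

// Explicit specialization for the CPU context: synchronous copy, no stream.
template <>
void CopyWithContext(const ToyCPUContext& ctx, void* dst, const void* src,
                     std::size_t num) {
  (void)ctx;
  std::memcpy(dst, src, num);
  std::cout << "sync CPU copy of " << num << " bytes\n";
}

int main() {
  char src[16] = "strided memcpy";
  char dst[16] = {};
  CopyWithContext(ToyCPUContext{}, dst, src, sizeof(src));     // specialization
  CopyWithContext(ToyStreamContext{}, dst, src, sizeof(src));  // primary template
  std::cout << dst << "\n";
  return 0;
}
```

The benefit mirrors the patch: the synchronous CPU path and the stream-based device path are chosen at compile time from the caller's context type, so the copy helper needs no header that defines every concrete device context.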