Unverified commit 558068cc, authored by YuanRisheng and committed by GitHub

[PHI Decoupling] Remove memory header (Part2) (#50870)

* decouple memory copy

* fix ci bugs

* fix ci compile bugs

* fix rocm compile

* fix ci bugs
Parent d9fb639c
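For phi code, the call-site change is mechanical: drop the direct include of paddle/fluid/memory/memcpy.h and route the copy through the phi::memory_utils facade added in this patch, which forwards to the fluid implementation registered in InitMemoryMethod(). A minimal before/after sketch (the buffer names and size here are illustrative, not taken from the patch):

    // before: phi code depends on the fluid memory header
    #include "paddle/fluid/memory/memcpy.h"
    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, num_bytes, stream);

    // after: phi code only sees the phi-side facade
    #include "paddle/phi/common/memory_utils.h"
    phi::memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, num_bytes, stream);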
@@ -257,6 +257,7 @@ void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
     return Copy(place_dst, dst, src_place, src, num);
   }
 }
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
......
@@ -133,7 +133,7 @@ endif()
 cc_library(
   init
   SRCS init.cc
-  DEPS device_context custom_kernel context_pool)
+  DEPS device_context custom_kernel context_pool memcpy)
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
......
@@ -55,7 +55,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/ipu/ipu_info.h"
 #endif
-#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/custom_kernel.h"
@@ -469,6 +469,14 @@ void InitMemoryMethod() {
     memory_method->in_same_stream = paddle::memory::InSameStream;
     memory_method->allocation_deleter =
         paddle::memory::allocation::Allocator::AllocationDeleter;
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_CUDA) || \
+    defined(PADDLE_WITH_HIP)
+    memory_method->copy_with_stream =
+        paddle::memory::Copy<phi::Place, phi::Place>;
+#endif
+    memory_method->copy = paddle::memory::Copy<phi::Place, phi::Place>;
+    memory_method->device_memory_stat_current_value =
+        paddle::memory::DeviceMemoryStatCurrentValue;
     memory_utils.Init(std::move(memory_method));
   });
 }
......
@@ -47,6 +47,27 @@ void AllocationDeleter(Allocation* allocation) {
   MemoryUtils::Instance().AllocationDeleter(allocation);
 }
+void Copy(const Place& dst_place,
+          void* dst,
+          const Place& src_place,
+          const void* src,
+          size_t num,
+          void* stream) {
+  MemoryUtils::Instance().Copy(dst_place, dst, src_place, src, num, stream);
+}
+
+void Copy(const Place& dst_place,
+          void* dst,
+          const Place& src_place,
+          const void* src,
+          size_t num) {
+  MemoryUtils::Instance().Copy(dst_place, dst, src_place, src, num);
+}
+
+int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
+  return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
+                                                              dev_id);
+}
 }  // namespace memory_utils
 }  // namespace phi
@@ -77,6 +77,42 @@ struct MemoryInterface {
    * @param[Allocation] allocation the allocation to be freed
    */
   void (*allocation_deleter)(Allocation* allocation);
+
+  /**
+   * @brief Copy memory from one place to another place.
+   *
+   * @param[Place] DstPlace Destination allocation place (CPU or GPU or XPU or
+   * CustomDevice).
+   * @param[void*] dst Destination memory address.
+   * @param[Place] SrcPlace Source allocation place (CPU or GPU or XPU or
+   * CustomDevice).
+   * @param[void*] src Source memory address.
+   * @param[size_t] num memory size in bytes to copy.
+   * @param[void*] stream stream for asynchronous memory copy.
+   *
+   * @note For GPU/XPU/CustomDevice memory copy, a stream needs to be specified
+   * for asynchronous memory copy, and its type is restored in the
+   * implementation.
+   */
+  void (*copy)(
+      Place dst_place, void* dst, Place src_place, const void* src, size_t num);
+  void (*copy_with_stream)(Place dst_place,
+                           void* dst,
+                           Place src_place,
+                           const void* src,
+                           size_t num,
+                           void* stream);
+
+  /**
+   * @brief get the device STAT value
+   *
+   * @param[std::string] stat_type memory's stat type, can be 'Allocated' or
+   * 'Reserved'
+   * @param[int] dev_id device id
+   */
+  int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
+                                              int dev_id);
 };
 class MemoryUtils {
@@ -156,6 +192,48 @@ class MemoryUtils {
     return memory_method_->allocation_deleter(allocation);
   }
+
+  void Copy(const Place& dst_place,
+            void* dst,
+            const Place& src_place,
+            const void* src,
+            size_t num,
+            void* stream) {
+    CheckMemoryMethod();
+    PADDLE_ENFORCE_NE(memory_method_->copy_with_stream,
+                      nullptr,
+                      phi::errors::Unavailable(
+                          "copy_with_stream method in memory_method_ is not "
+                          "initialized yet. You need to init it first."));
+    memory_method_->copy_with_stream(
+        dst_place, dst, src_place, src, num, stream);
+  }
+
+  void Copy(const Place& dst_place,
+            void* dst,
+            const Place& src_place,
+            const void* src,
+            size_t num) {
+    CheckMemoryMethod();
+    PADDLE_ENFORCE_NE(
+        memory_method_->copy,
+        nullptr,
+        phi::errors::Unavailable("copy method in memory_method_ is not "
+                                 "initialized yet. You need to init it first."));
+    memory_method_->copy(dst_place, dst, src_place, src, num);
+  }
+
+  int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type,
+                                       int dev_id) {
+    CheckMemoryMethod();
+    PADDLE_ENFORCE_NE(
+        memory_method_->device_memory_stat_current_value,
+        nullptr,
+        phi::errors::Unavailable(
+            "device_memory_stat_current_value method in memory_method_ is not "
+            "initialized yet. You need to init it first."));
+    return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
+  }
   void CheckMemoryMethod() {
     PADDLE_ENFORCE_NE(
         memory_method_.get(),
@@ -199,6 +277,18 @@ bool InSameStream(const std::shared_ptr<Allocation>& allocation,
 void AllocationDeleter(Allocation* allocation);
+void Copy(const Place& dst_place,
+          void* dst,
+          const Place& src_place,
+          const void* src,
+          size_t num,
+          void* stream);
+void Copy(const Place& dst_place,
+          void* dst,
+          const Place& src_place,
+          const void* src,
+          size_t num);
+int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
 }  // namespace memory_utils
 }  // namespace phi
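Taken together, the function-pointer table and the free functions above let phi components issue copies without including fluid headers. A small usage sketch, assuming a phi::GPUContext dev_ctx and float buffers dst/src of n elements (these names are illustrative, not from the patch):

    // synchronous host-to-host copy: the overload without a stream argument
    phi::memory_utils::Copy(phi::CPUPlace(), dst, phi::CPUPlace(), src, n * sizeof(float));

    // asynchronous host-to-device copy on the context's stream; a null stream
    // is expected to fall back to a blocking copy in the underlying implementation
    phi::memory_utils::Copy(dev_ctx.GetPlace(), dst, phi::CPUPlace(), src,
                            n * sizeof(float), dev_ctx.stream());

Both overloads check that the corresponding function pointer was installed by InitMemoryMethod() and raise an Unavailable error otherwise.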
@@ -22,7 +22,6 @@ limitations under the License. */
 #include <vector>
 #include "glog/logging.h"
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/utils/none.h"
@@ -41,7 +40,7 @@ void CopyToCPUHelper(std::vector<T> *cpu_,
   auto stream = dev_ctx->stream();
   void *src = (*gpu_)->ptr();
   void *dst = cpu_->data();
-  paddle::memory::Copy(phi::CPUPlace(),
+  memory_utils::Copy(phi::CPUPlace(),
                      dst,
                      OptionalCUDAPlace(*gpu_).get(),
                      src,
@@ -64,7 +63,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
   auto *dev_ctx = static_cast<phi::GPUContext *>(
       phi::DeviceContextPool::Instance().Get(place));
   auto stream = dev_ctx->stream();
-  paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(),
+  memory_utils::Copy(OptionalCUDAPlace(*gpu_).get(),
                      dst,
                      phi::CPUPlace(),
                      src,
......
@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/phi/core/selected_rows_impl.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/utils/data_type.h"
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/memory/memcpy.h"
 namespace phi {
 struct ReAllocateVisitor {
......
@@ -16,11 +16,10 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
 namespace phi {
@@ -99,13 +98,13 @@ void Copy(const Context& dev_ctx,
   if (src_place.GetType() == AllocationType::CPU &&
       dst_place.GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   } else if ((src_place.GetType() == AllocationType::CPU ||
               src_place.GetType() == AllocationType::GPUPINNED) &&  // NOLINT
              (dst_place.GetType() == AllocationType::CPU ||
              dst_place.GetType() == AllocationType::GPUPINNED)) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
   } else if (src_place.GetType() == AllocationType::GPU &&  // NOLINT
              dst_place.GetType() == AllocationType::CPU) {
     auto src_gpu_place = src_place;
@@ -128,7 +127,7 @@ void Copy(const Context& dev_ctx,
     auto stream =
         blocking ? nullptr
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
-    paddle::memory::Copy(
+    memory_utils::Copy(
         dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
   } else if ((src_place.GetType() == AllocationType::CPU ||
               src_place.GetType() == AllocationType::GPUPINNED) &&  // NOLINT
@@ -153,7 +152,7 @@ void Copy(const Context& dev_ctx,
     auto stream =
         blocking ? nullptr
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
-    paddle::memory::Copy(
+    memory_utils::Copy(
         dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
   } else if (src_place.GetType() == AllocationType::GPU &&  // NOLINT
              dst_place.GetType() == AllocationType::GPU) {
@@ -170,16 +169,16 @@ void Copy(const Context& dev_ctx,
         blocking ? nullptr
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
     if (src_place.GetType() == dst_place.GetType()) {
-      paddle::memory::Copy(
+      memory_utils::Copy(
           dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
     } else {
       if (ctx_place.GetType() == src_place.GetType()) {
-        paddle::memory::Copy(
+        memory_utils::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
         phi::DeviceContextPool::Instance().Get(src.place())->Wait();
       } else if (ctx_place.GetType() == dst_place.GetType()) {
         phi::DeviceContextPool::Instance().Get(src.place())->Wait();
-        paddle::memory::Copy(
+        memory_utils::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
       } else {
         PADDLE_THROW(errors::Unavailable(
@@ -208,16 +207,16 @@ void Copy(const Context& dev_ctx,
     auto stream =
         blocking ? nullptr
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
-    paddle::memory::Copy(
+    memory_utils::Copy(
         dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
 #endif
 #ifdef PADDLE_WITH_XPU
   } else if (src_place.GetType() == AllocationType::XPU &&  // NOLINT
              dst_place.GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   } else if (src_place.GetType() == AllocationType::CPU &&
              dst_place.GetType() == AllocationType::XPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   } else if (src_place.GetType() == AllocationType::XPU &&
              dst_place.GetType() == AllocationType::XPU) {
     if (src_ptr == dst_ptr) {
@@ -225,7 +224,7 @@ void Copy(const Context& dev_ctx,
           << dst_place;
       return;
     }
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   } else if (src_place.GetType() == AllocationType::CUSTOM &&  // NOLINT
@@ -234,21 +233,21 @@ void Copy(const Context& dev_ctx,
     auto stream =
         blocking
             ? nullptr
            : reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
   } else if (src_place.GetType() == AllocationType::CPU &&  // NOLINT
              dst_place.GetType() == AllocationType::CUSTOM) {
     auto stream =
         blocking
             ? nullptr
             : reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
   } else if (src_place.GetType() == AllocationType::CUSTOM &&  // NOLINT
              dst_place.GetType() == AllocationType::CUSTOM) {
     auto stream =
         blocking
             ? nullptr
             : reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
 #endif
   } else {
     PADDLE_THROW(errors::Unimplemented(
@@ -425,12 +424,11 @@ void TensorFromVector(const std::vector<T>& src,
   auto size = src.size() * sizeof(T);
   if (dst_place.GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (dst_place.GetType() == AllocationType::GPU) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place,
+    memory_utils::Copy(dst_place,
                        dst_ptr,
                        src_place,
                        src_ptr,
@@ -440,7 +438,7 @@ void TensorFromVector(const std::vector<T>& src,
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (dst_place.GetType() == AllocationType::CUSTOM) {  // NOLINT
-    paddle::memory::Copy(
+    memory_utils::Copy(
         dst_place,
         dst_ptr,
         src_place,
@@ -451,7 +449,7 @@ void TensorFromVector(const std::vector<T>& src,
 #endif
 #ifdef PADDLE_WITH_XPU
   else if (dst_place.GetType() == AllocationType::XPU) {  // NOLINT
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #endif
   else {  // NOLINT
@@ -480,12 +478,11 @@ void TensorFromVector(const std::vector<bool>& src,
   auto size = src.size() * sizeof(bool);
   if (dst_place.GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (dst_place.GetType() == AllocationType::GPU) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place,
+    memory_utils::Copy(dst_place,
                        dst_ptr,
                        src_place,
                        src_ptr,
@@ -496,12 +493,12 @@ void TensorFromVector(const std::vector<bool>& src,
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (dst_place.GetType() == AllocationType::CUSTOM) {  // NOLINT
     auto stream = reinterpret_cast<const phi::CustomContext&>(ctx).stream();
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
   }
 #endif
 #ifdef PADDLE_WITH_XPU
   else if (dst_place.GetType() == AllocationType::XPU) {  // NOLINT
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #endif
   else {  // NOLINT
@@ -573,12 +570,11 @@ void TensorFromArray(const T* src,
   auto size = array_size * sizeof(T);
   if (dst_place.GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (dst_place.GetType() == AllocationType::GPU) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place,
+    memory_utils::Copy(dst_place,
                        dst_ptr,
                        src_place,
                        src_ptr,
@@ -588,7 +584,7 @@ void TensorFromArray(const T* src,
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (dst_place.GetType() == AllocationType::CUSTOM) {  // NOLINT
-    paddle::memory::Copy(
+    memory_utils::Copy(
         dst_place,
         dst_ptr,
         src_place,
@@ -599,7 +595,7 @@ void TensorFromArray(const T* src,
 #endif
 #ifdef PADDLE_WITH_XPU
   else if (dst_place.GetType() == AllocationType::XPU) {  // NOLINT
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #endif
   else {  // NOLINT
@@ -674,12 +670,11 @@ void TensorToVector(const phi::DenseTensor& src,
   auto dst_ptr = static_cast<void*>(dst->data());
   if (src.place().GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (src.place().GetType() == AllocationType::GPU) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place,
+    memory_utils::Copy(dst_place,
                        dst_ptr,
                        src.place(),
                        src_ptr,
@@ -689,13 +684,12 @@ void TensorToVector(const phi::DenseTensor& src,
 #endif
 #if defined(PADDLE_WITH_XPU)
   else if (src.place().GetType() == AllocationType::XPU) {  // NOLINT
-    paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (src.place().GetType() == AllocationType::CUSTOM) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
   }
 #endif
   else {  // NOLINT
@@ -718,12 +712,11 @@ void TensorToVector(const phi::DenseTensor& src,
   auto dst_ptr = static_cast<void*>(array);
   if (src.place().GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (src.place().GetType() == AllocationType::GPU) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place,
+    memory_utils::Copy(dst_place,
                        dst_ptr,
                        src.place(),
                        src_ptr,
@@ -733,13 +726,12 @@ void TensorToVector(const phi::DenseTensor& src,
 #endif
 #if defined(PADDLE_WITH_XPU)
   else if (src.place().GetType() == AllocationType::XPU) {  // NOLINT
-    paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (src.place().GetType() == AllocationType::CUSTOM) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
   }
 #endif
   for (unsigned int i = 0; i < src.numel(); i++) {
@@ -800,7 +792,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<T>* dst) {
         "The input tensor should be CPU device, but actually it is in %s.",
         src.place()));
-  paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+  memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
 }
 template <>
@@ -821,7 +813,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<bool>* dst) {
         "The input tensor should be CPU device, but actually it is in %s.",
         src.place()));
-  paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+  memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   for (unsigned int i = 0; i < src.numel(); i++) {
     (*dst)[i] = static_cast<bool>(array[i]);
......
@@ -13,10 +13,9 @@
 // limitations under the License.
 #include "paddle/phi/kernels/index_add_kernel.h"
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
-// #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/cpu/index_add_impl.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
......
@@ -14,8 +14,8 @@
 #include "paddle/phi/kernels/multiplex_grad_kernel.h"
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
@@ -43,7 +43,7 @@ void MultiplexGradKernel(const Context& ctx,
   for (auto i = 0; i < rows; i++) {
     size_t k = static_cast<size_t>(index[i]);
     if (ins_grad[k]) {
-      paddle::memory::Copy(ctx.GetPlace(),
+      memory_utils::Copy(ctx.GetPlace(),
                          ins_grad[k]->data<T>() + i * cols,
                          ctx.GetPlace(),
                          out_grad.data<T>() + i * cols,
......
@@ -14,8 +14,8 @@
 #include "paddle/phi/kernels/multiplex_kernel.h"
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 namespace phi {
@@ -45,7 +45,7 @@ void MultiplexKernel(const Context& ctx,
         ins.size(),
         errors::PreconditionNotMet(
             "index exceeds the number of candidate tensors."));
-    paddle::memory::Copy(ctx.GetPlace(),
+    memory_utils::Copy(ctx.GetPlace(),
                        out->data<T>() + i * cols,
                        ctx.GetPlace(),
                        ins[k]->data<T>() + i * cols,
......
@@ -22,8 +22,7 @@
 #ifdef PADDLE_WITH_XPU
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/backends/xpu/xpu_header.h"
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/common/memory_utils.h"
 #endif
 namespace phi {
@@ -45,13 +44,13 @@ static int ConvertDataByType(
   T1* cpu_data = reinterpret_cast<T1*>(malloc(sizeof(T1) * len));
-  paddle::memory::Copy(
+  memory_utils::Copy(
       CPUPlace(), cpu_data, dev_ctx.GetPlace(), x, len * sizeof(T1));
   T2* cpu_real_data = reinterpret_cast<T2*>(malloc(sizeof(T2) * len));
   for (int i = 0; i < len; i++) cpu_real_data[i] = static_cast<T2>(cpu_data[i]);
-  paddle::memory::Copy(
+  memory_utils::Copy(
      dev_ctx.GetPlace(), *y, CPUPlace(), cpu_real_data, len * sizeof(T2));
   free(cpu_data);
......
@@ -57,7 +57,7 @@ struct ConcatFunctor<phi::CPUContext, T> {
       int64_t col_len = input_cols[j];
       auto input_data = input[j].data<T>();
       for (int64_t k = 0; k < out_rows; ++k) {
-        paddle::memory::Copy(cpu_place,
+        memory_utils::Copy(cpu_place,
                            output_data + k * out_cols + col_idx,
                            cpu_place,
                            input_data + k * col_len,
@@ -114,7 +114,7 @@ struct SplitFunctor<phi::CPUContext, T> {
         auto* out_tensor = outputs->at(j);
         if (out_tensor != nullptr) {
           T* dst_ptr = out_tensor->data<T>() + k * col_len;
-          paddle::memory::Copy(cpu_place,
+          memory_utils::Copy(cpu_place,
                              dst_ptr,
                              cpu_place,
                              src_ptr + col_idx,
......
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/kernels/funcs/segmented_array.h"
 namespace phi {
@@ -105,7 +106,7 @@ struct PointerToPointer {
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
     auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
         pre_alloced_host_ptr, in_num);
-    paddle::memory::Copy(ctx.GetPlace(),
+    memory_utils::Copy(ctx.GetPlace(),
                        (*dev_ins_ptr)->ptr(),
                        phi::CPUPlace(),
                        restored,
@@ -155,7 +156,7 @@ struct PointerToPointerAndCol {
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
     auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
         inputs_col, inputs_col_num);
-    paddle::memory::Copy(ctx.GetPlace(),
+    memory_utils::Copy(ctx.GetPlace(),
                        (*dev_col_ptr)->ptr(),
                        phi::CPUPlace(),
                        restored,
@@ -570,11 +571,11 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
   IndexT* inputs_col = inputs_col_vec.data();
 #ifdef PADDLE_WITH_HIP
   // TODO(chentianyu03): try to find a method to remove the Alloc function
-  phi::Allocator::AllocationPtr data_alloc = phi::memory_utils::Alloc(
-      paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*));
+  phi::Allocator::AllocationPtr data_alloc =
+      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
   inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
   phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
-      paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(IndexT));
+      phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
   inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
 #endif
@@ -786,11 +787,11 @@ void SplitFunctorDispatchWithIndexType(
 #ifdef PADDLE_WITH_HIP
   phi::Allocator::AllocationPtr data_alloc, cols_alloc;
   // TODO(chentianyu03): try to find a method to remove the Alloc function
-  data_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
-                                        out_num * sizeof(T*));
+  data_alloc =
+      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
   outs_data = reinterpret_cast<T**>(data_alloc->ptr());
   // TODO(chentianyu03): try to find a method to remove the Alloc function
-  cols_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
+  cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
                                         (out_cols_num) * sizeof(IndexT));
   outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
 #endif
......
@@ -19,13 +19,11 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/device_context.h"
 #include "paddle/phi/core/utils/data_type.h"
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/memory/memcpy.h"
 namespace phi {
 namespace funcs {
......
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/device_context.h"
@@ -39,12 +39,12 @@ struct StridedMemcpyFunctor<T, 0> {
     auto place = dev_ctx.GetPlace();
     if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      paddle::memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
+      memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
-      paddle::memory::Copy(
+      memory_utils::Copy(
           gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream());
 #else
       PADDLE_THROW(
@@ -65,13 +65,13 @@ struct StridedMemcpyFunctor<T, 1> {
     auto place = dev_ctx.GetPlace();
     if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      paddle::memory::Copy(
+      memory_utils::Copy(
           cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
-      paddle::memory::Copy(gpu_place,
+      memory_utils::Copy(gpu_place,
                          dst,
                          gpu_place,
                          src,
......
@@ -23,8 +23,6 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/for_range.h"
 #if defined(__NVCC__) || defined(__HIPCC__)
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
@@ -1544,19 +1542,19 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   int *out_dims_array_gpu =
       reinterpret_cast<int *>(y_strides_array_gpu + max_dim);
-  paddle::memory::Copy(gplace,
+  memory_utils::Copy(gplace,
                      x_strides_array_gpu,
                      cplace,
                      x_strides_array.data(),
                      bytes,
                      ctx.stream());
-  paddle::memory::Copy(gplace,
+  memory_utils::Copy(gplace,
                      y_strides_array_gpu,
                      cplace,
                      y_strides_array.data(),
                      bytes,
                      ctx.stream());
-  paddle::memory::Copy(
+  memory_utils::Copy(
       gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
   const int out_size = std::accumulate(
@@ -1573,13 +1571,13 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
     int *x_dims_order_gpu =
         reinterpret_cast<int *>(x_strides_order_gpu + max_dim);
-    paddle::memory::Copy(gplace,
+    memory_utils::Copy(gplace,
                        x_strides_order_gpu,
                        cplace,
                        x_strides_order.data(),
                        bytes,
                        ctx.stream());
-    paddle::memory::Copy(gplace,
+    memory_utils::Copy(gplace,
                        x_dims_order_gpu,
                        cplace,
                        x_dims_order.data(),
@@ -1612,13 +1610,13 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
     int *y_dims_order_gpu =
        reinterpret_cast<int *>(y_strides_order_gpu + max_dim);
-    paddle::memory::Copy(gplace,
+    memory_utils::Copy(gplace,
                        y_strides_order_gpu,
                        cplace,
                        y_strides_order.data(),
                        bytes,
                        ctx.stream());
-    paddle::memory::Copy(gplace,
+    memory_utils::Copy(gplace,
                        y_dims_order_gpu,
                        cplace,
                        y_dims_order.data(),
......
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <vector>
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/common/memory_utils.h"
 // TODO(paddle-dev): move gpu_primitives.h to phi
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
......
@@ -14,7 +14,6 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/data_type.h"
@@ -200,7 +199,7 @@ void TransposeNormal<DeviceContext, T>::operator()(
     cpu_buf[rank + i] = out_stride[i];
     cpu_buf[2 * rank + i] = axis[i];
   }
-  paddle::memory::Copy(
+  memory_utils::Copy(
       cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
   REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
   REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
@@ -243,7 +242,7 @@ struct TransposeNormal<phi::GPUContext, T> {
     cpu_buf[rank + i] = out_stride[i];
     cpu_buf[2 * rank + i] = axis[i];
   }
-  paddle::memory::Copy(
+  memory_utils::Copy(
      cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
   REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
   REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
......
@@ -119,7 +119,7 @@ struct TensorSetConstantXPU {
     int numel = tensor_->numel();
     std::unique_ptr<T[]> data_cpu(new T[numel]);
     std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast<T>(value_));
-    paddle::memory::Copy(place_,
+    memory_utils::Copy(place_,
                        begin,
                        phi::CPUPlace(),
                        static_cast<void*>(data_cpu.get()),
......
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/matrix_inverse.h"
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 namespace phi {
@@ -39,7 +39,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
         dev_ctx.GetPlace(),
         a.numel() * sizeof(T),
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-    paddle::memory::Copy(dev_ctx.GetPlace(),
+    memory_utils::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_mat_data->ptr(),
                        dev_ctx.GetPlace(),
                        a.data(),
@@ -62,7 +62,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
       dev_ctx.GetPlace(),
       total_bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-  paddle::memory::Copy(dev_ctx.GetPlace(),
+  memory_utils::Copy(dev_ctx.GetPlace(),
                      tmp_gpu_ptrs_data->ptr(),
                      phi::CPUPlace(),
                      static_cast<void*>(cpu_ptrs.data()),
@@ -107,7 +107,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
                    gpu_info_ptr,
                    batch_size);
   }
-  paddle::memory::Copy(phi::CPUPlace(),
+  memory_utils::Copy(phi::CPUPlace(),
                      info.data(),
                      dev_ctx.GetPlace(),
                      gpu_info_ptr,
......
@@ -84,7 +84,7 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
       context.GetPlace(),
       cpu_ptrs.size() * sizeof(T*),
       phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
-  paddle::memory::Copy(context.GetPlace(),
+  memory_utils::Copy(context.GetPlace(),
                      tmp_gpu_ptrs_data->ptr(),
                      phi::CPUPlace(),
                      static_cast<void*>(cpu_ptrs.data()),
@@ -121,7 +121,7 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
                     batch_size);
   // check whether BatchedGETRF is executed successfully or not
-  paddle::memory::Copy(phi::CPUPlace(),
+  memory_utils::Copy(phi::CPUPlace(),
                      info.data(),
                      context.GetPlace(),
                      gpu_info_ptr,
......
@@ -25,9 +25,8 @@ namespace cub = hipcub;
 #endif
 #include <algorithm>
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
@@ -433,7 +432,7 @@ void SelectKernel(const KPDevice &dev_ctx,
   // 3.1 set temp ptr for in;
   // 3.1 alloc for out
   // 3.1.1 get true_num for gpu place the last cumsum is the true_num
-  paddle::memory::Copy(cpu_place,
+  memory_utils::Copy(cpu_place,
                      &total_true_num,
                      cuda_place,
                      cumsum_data + need_grids,
......
@@ -93,14 +93,14 @@ struct SelectedRowsAdd<phi::CPUContext, T> {
   auto* out_data = out_value->data<T>();
   auto* in1_data = in1_value.data<T>();
-  paddle::memory::Copy(out_place,
+  memory_utils::Copy(out_place,
                      out_data,
                      in1_place,
                      in1_data,
                      in1_value.numel() * sizeof(T));
   auto* in2_data = in2_value.data<T>();
-  paddle::memory::Copy(out_place,
+  memory_utils::Copy(out_place,
                      out_data + in1_value.numel(),
                      in2_place,
                      in2_data,
@@ -219,7 +219,7 @@ struct SelectedRowsAddTo<phi::CPUContext, T> {
   auto* in1_data = in1_value.data<T>();
   auto* in2_data = in2_value->data<T>();
-  paddle::memory::Copy(in2_place,
+  memory_utils::Copy(in2_place,
                      in2_data + input2_offset,
                      in1_place,
                      in1_data,
@@ -566,7 +566,7 @@ struct MergeAddImpl {
     for (auto* in : inputs) {
       auto* in_data = in->value().data<T>();
       auto in_numel = in->rows().size() * input_width;
-      paddle::memory::Copy(out_place,
+      memory_utils::Copy(out_place,
                          out_data + copied_numel,
                          in_place,
                          in_data,
@@ -680,12 +680,12 @@ struct MergeAdd<phi::XPUContext, T> {
     xpu::ctx_guard RAII_GUARD(context.x_context());
     int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
     int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
-    paddle::memory::Copy(context.GetPlace(),
+    memory_utils::Copy(context.GetPlace(),
                        y_rows_data,
                        phi::CPUPlace(),
                        merge_rows.data(),
                        ym * sizeof(int64_t));
-    paddle::memory::Copy(context.GetPlace(),
+    memory_utils::Copy(context.GetPlace(),
                        x_rows_data,
                        phi::CPUPlace(),
                        input_rows.data(),
@@ -778,12 +778,12 @@ struct MergeAdd<phi::XPUContext, T> {
     xpu::ctx_guard RAII_GUARD(context.x_context());
     int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
     int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
-    paddle::memory::Copy(context.GetPlace(),
+    memory_utils::Copy(context.GetPlace(),
                        y_rows_data,
                        phi::CPUPlace(),
                        merge_rows.data(),
                        ym * sizeof(int64_t));
-    paddle::memory::Copy(context.GetPlace(),
+    memory_utils::Copy(context.GetPlace(),
                        x_rows_data,
                        phi::CPUPlace(),
                        input_rows.data(),
......
@@ -91,7 +91,7 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
       phi::errors::InvalidArgument(
           "The running environment is not on the GPU place."));
-  paddle::memory::Copy(out_place,
+  memory_utils::Copy(out_place,
                      out_data,
                      in1_place,
                      in1_data,
@@ -99,7 +99,7 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
                      context.stream());
   auto* in2_data = in2_value.data<T>();
-  paddle::memory::Copy(out_place,
+  memory_utils::Copy(out_place,
                      out_data + in1_value.numel(),
                      in2_place,
                      in2_data,
@@ -249,7 +249,7 @@ struct SelectedRowsAddTo<phi::GPUContext, T> {
   auto* in1_data = in1_value.data<T>();
   auto* in2_data = in2_value->data<T>();
-  paddle::memory::Copy(in2_place,
+  memory_utils::Copy(in2_place,
                      in2_data + input2_offset,
                      in1_place,
                      in1_data,
......
@@ -104,7 +104,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
   for (int64_t i = 0; i < before; ++i) {
     if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      paddle::memory::Copy(cpu_place,
+      memory_utils::Copy(cpu_place,
                          dst + i * dst_after,
                          cpu_place,
                          src + i * src_after,
@@ -113,7 +113,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx);
-      paddle::memory::Copy(gpu_place,
+      memory_utils::Copy(gpu_place,
                          dst + i * dst_after,
                          gpu_place,
                          src + i * src_after,
@@ -122,7 +122,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
 #elif defined(PADDLE_WITH_ASCEND_CL)
       auto& npu_place = place;
       auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
-      paddle::memory::Copy(npu_place,
+      memory_utils::Copy(npu_place,
                          dst + i * dst_after,
                          npu_place,
                          src + i * src_after,
@@ -131,7 +131,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
 #elif defined(PADDLE_WITH_MLU)
       auto& mlu_place = place;
       auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
-      paddle::memory::Copy(mlu_place,
+      memory_utils::Copy(mlu_place,
                          dst + i * dst_after,
                          mlu_place,
                          src + i * src_after,
......
...@@ -16,9 +16,9 @@ ...@@ -16,9 +16,9 @@
#include <sstream> #include <sstream>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/string/string_helper.h" #include "paddle/utils/string/string_helper.h"
...@@ -39,7 +39,7 @@ static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) { ...@@ -39,7 +39,7 @@ static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) {
std::vector<CopyT> cpu_x(n); std::vector<CopyT> cpu_x(n);
auto *dev_ctx = static_cast<phi::GPUContext *>( auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place)); phi::DeviceContextPool::Instance().Get(place));
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
cpu_x.data(), cpu_x.data(),
place, place,
x, x,
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/core/errors.h" #include "paddle/phi/core/errors.h"
...@@ -191,7 +190,7 @@ static void CheckEighResult(const GPUContext &dev_ctx, ...@@ -191,7 +190,7 @@ static void CheckEighResult(const GPUContext &dev_ctx,
const int64_t batch_size, const int64_t batch_size,
int *info) { int *info) {
std::vector<int> error_info(batch_size); std::vector<int> error_info(batch_size);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
error_info.data(), error_info.data(),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/add_n_kernel.h" #include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h" #include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
...@@ -208,7 +207,7 @@ void AddNKernel(const Context &dev_ctx, ...@@ -208,7 +207,7 @@ void AddNKernel(const Context &dev_ctx,
auto tmp_sr_in_out_array = phi::memory_utils::Alloc( auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *)); dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(), tmp_sr_in_out_array->ptr(),
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()), reinterpret_cast<void *>(sr_in_out_data.data()),
...@@ -229,7 +228,7 @@ void AddNKernel(const Context &dev_ctx, ...@@ -229,7 +228,7 @@ void AddNKernel(const Context &dev_ctx,
auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(), auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
in_data.size() * sizeof(T *)); in_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(), tmp_in_array->ptr(),
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void *>(in_data.data()), reinterpret_cast<void *>(in_data.data()),
......
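A second recurring pattern in the add_n and amp hunks is staging a host-built pointer table on the device: memory_utils::Alloc reserves the device buffer and memory_utils::Copy ships the CPU vector into it. A hedged sketch of that pairing, where the table of float pointers and the function name are illustrative assumptions:

#include <vector>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"

// Illustrative only: put a CPU-side array of device pointers onto the GPU so a
// kernel can index it, as the add_n hunk above does for its inputs.
void StagePointerTable(const phi::GPUContext& dev_ctx,
                       const std::vector<const float*>& ptrs) {
  auto table = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
                                        ptrs.size() * sizeof(const float*));
  phi::memory_utils::Copy(dev_ctx.GetPlace(),
                          table->ptr(),
                          phi::CPUPlace(),
                          reinterpret_cast<const void*>(ptrs.data()),
                          ptrs.size() * sizeof(const float*),
                          dev_ctx.stream());
  // A kernel launch would consume reinterpret_cast<const float**>(table->ptr())
  // here; the allocation is released when `table` goes out of scope.
}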
...@@ -20,8 +20,6 @@ ...@@ -20,8 +20,6 @@
#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/impl/amp_kernel_impl.h" #include "paddle/phi/kernels/impl/amp_kernel_impl.h"
#include "paddle/fluid/memory/memory.h"
namespace phi { namespace phi {
// Utils // Utils
...@@ -176,7 +174,7 @@ class LazyZeros<phi::GPUContext, T> { ...@@ -176,7 +174,7 @@ class LazyZeros<phi::GPUContext, T> {
for (int i = 0; i < xs_size; i++) { for (int i = 0; i < xs_size; i++) {
h_starts[i + 1] = h_starts[i] + outs[i]->numel(); h_starts[i + 1] = h_starts[i] + outs[i]->numel();
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
d_starts, d_starts,
cpu_place, cpu_place,
h_starts, h_starts,
...@@ -197,7 +195,7 @@ class LazyZeros<phi::GPUContext, T> { ...@@ -197,7 +195,7 @@ class LazyZeros<phi::GPUContext, T> {
for (size_t i = 0; i < xs_size; ++i) { for (size_t i = 0; i < xs_size; ++i) {
h_out_addrs[i] = dev_ctx.Alloc<T>(outs[i]); h_out_addrs[i] = dev_ctx.Alloc<T>(outs[i]);
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
d_out_addrs, d_out_addrs,
cpu_place, cpu_place,
h_out_addrs, h_out_addrs,
...@@ -306,7 +304,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -306,7 +304,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel();
} }
int64_t total_num = h_starts[xs_size]; int64_t total_num = h_starts[xs_size];
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
d_starts, d_starts,
cpu_place, cpu_place,
h_starts, h_starts,
...@@ -329,7 +327,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -329,7 +327,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
h_xs[i] = xs[i]->data<T>(); h_xs[i] = xs[i]->data<T>();
h_outs[i] = dev_ctx.template Alloc<T>(outs[i]); h_outs[i] = dev_ctx.template Alloc<T>(outs[i]);
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
d_xs, d_xs,
cpu_place, cpu_place,
h_xs, h_xs,
......
...@@ -30,19 +30,19 @@ void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx, ...@@ -30,19 +30,19 @@ void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
int64_t* old_num_accumulates) { int64_t* old_num_accumulates) {
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
auto cuda_place = in_old_num_accumulates.place(); auto cuda_place = in_old_num_accumulates.place();
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
old_num_accumulates, old_num_accumulates,
cuda_place, cuda_place,
in_old_num_accumulates.data<int64_t>(), in_old_num_accumulates.data<int64_t>(),
sizeof(int64_t), sizeof(int64_t),
stream); stream);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
num_accumulates, num_accumulates,
cuda_place, cuda_place,
in_num_accumulates.data<int64_t>(), in_num_accumulates.data<int64_t>(),
sizeof(int64_t), sizeof(int64_t),
stream); stream);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
num_updates, num_updates,
cuda_place, cuda_place,
in_num_updates.data<int64_t>(), in_num_updates.data<int64_t>(),
...@@ -68,21 +68,21 @@ void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx, ...@@ -68,21 +68,21 @@ void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
auto cuda_place = out_old_num_accumulates->place(); auto cuda_place = out_old_num_accumulates->place();
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
out_num_accumulates_ptr, out_num_accumulates_ptr,
phi::CPUPlace(), phi::CPUPlace(),
&num_accumulates, &num_accumulates,
sizeof(int64_t), sizeof(int64_t),
stream); stream);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
out_old_num_accumulates_ptr, out_old_num_accumulates_ptr,
phi::CPUPlace(), phi::CPUPlace(),
&old_num_accumulates, &old_num_accumulates,
sizeof(int64_t), sizeof(int64_t),
stream); stream);
paddle::memory::Copy(cuda_place, memory_utils::Copy(cuda_place,
out_num_updates_ptr, out_num_updates_ptr,
phi::CPUPlace(), phi::CPUPlace(),
&num_updates, &num_updates,
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
...@@ -207,7 +206,7 @@ void BoxCoderKernel(const Context &dev_ctx, ...@@ -207,7 +206,7 @@ void BoxCoderKernel(const Context &dev_ctx,
float *dev_var_data = reinterpret_cast<float *>(dev_var->ptr()); float *dev_var_data = reinterpret_cast<float *>(dev_var->ptr());
auto cplace = phi::CPUPlace(); auto cplace = phi::CPUPlace();
const auto gplace = dev_ctx.GetPlace(); const auto gplace = dev_ctx.GetPlace();
paddle::memory::Copy( memory_utils::Copy(
gplace, dev_var_data, cplace, &variance[0], bytes, dev_ctx.stream()); gplace, dev_var_data, cplace, &variance[0], bytes, dev_ctx.stream());
output_box->Resize({row, col, len}); output_box->Resize({row, col, len});
......
...@@ -22,7 +22,6 @@ limitations under the License. */ ...@@ -22,7 +22,6 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
...@@ -196,7 +195,7 @@ void CholeskyKernel(const Context& dev_ctx, ...@@ -196,7 +195,7 @@ void CholeskyKernel(const Context& dev_ctx,
std::vector<int> error_info; // only for checking positive matrix std::vector<int> error_info; // only for checking positive matrix
error_info.resize(batch_count); error_info.resize(batch_count);
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
error_info.data(), error_info.data(),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_ptr, info_ptr,
......
...@@ -29,7 +29,7 @@ namespace cub = hipcub; ...@@ -29,7 +29,7 @@ namespace cub = hipcub;
#include <iterator> #include <iterator>
#include <random> #include <random>
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -581,7 +581,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, ...@@ -581,7 +581,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
T* sampled_local_class_center_ptr = T* sampled_local_class_center_ptr =
dev_ctx.template Alloc<T>(sampled_local_class_center); dev_ctx.template Alloc<T>(sampled_local_class_center);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
sampled_local_class_center_ptr, sampled_local_class_center_ptr,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
cub_sort_values_out_ptr, cub_sort_values_out_ptr,
......
...@@ -24,7 +24,6 @@ namespace cub = hipcub; ...@@ -24,7 +24,6 @@ namespace cub = hipcub;
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h" #include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h" #include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h"
...@@ -32,7 +31,7 @@ namespace cub = hipcub; ...@@ -32,7 +31,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -220,7 +219,7 @@ void DistributeFpnProposalsKernel( ...@@ -220,7 +219,7 @@ void DistributeFpnProposalsKernel(
int start = 0; int start = 0;
std::vector<int> sub_lod_list_cpu(lod_size * num_level); std::vector<int> sub_lod_list_cpu(lod_size * num_level);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
sub_lod_list_cpu.data(), sub_lod_list_cpu.data(),
place, place,
sub_lod_list_data, sub_lod_list_data,
......
...@@ -17,9 +17,9 @@ ...@@ -17,9 +17,9 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
...@@ -136,7 +136,7 @@ void EditDistanceKernel(const Context& ctx, ...@@ -136,7 +136,7 @@ void EditDistanceKernel(const Context& ctx,
if (normalized) { if (normalized) {
distance = distance / n; distance = distance / n;
} }
paddle::memory::Copy(ctx.GetPlace(), memory_utils::Copy(ctx.GetPlace(),
out_data + num, out_data + num,
CPUPlace(), CPUPlace(),
&distance, &distance,
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h" #include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/mixed_vector.h" #include "paddle/phi/core/mixed_vector.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
...@@ -182,7 +182,7 @@ struct EmbeddingSparseGradCUDAFunctor { ...@@ -182,7 +182,7 @@ struct EmbeddingSparseGradCUDAFunctor {
InputTypeConvert<<<grids, threads, 0, stream>>>( InputTypeConvert<<<grids, threads, 0, stream>>>(
ids_data, ids_num, mixv_new_rows.MutableData(gpu_place)); ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
} else { } else {
paddle::memory::Copy(gpu_place, memory_utils::Copy(gpu_place,
mixv_new_rows.CUDAMutableData(gpu_place), mixv_new_rows.CUDAMutableData(gpu_place),
gpu_place, gpu_place,
ids_data, ids_data,
...@@ -211,7 +211,7 @@ struct EmbeddingSparseGradCUDAFunctor { ...@@ -211,7 +211,7 @@ struct EmbeddingSparseGradCUDAFunctor {
"output@Grad's shape = [%s].", "output@Grad's shape = [%s].",
d_table_value->dims(), d_table_value->dims(),
d_output_dims_2d)); d_output_dims_2d));
paddle::memory::Copy(gpu_place, memory_utils::Copy(gpu_place,
d_table_data, d_table_data,
gpu_place, gpu_place,
d_output_data, d_output_data,
......
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -80,7 +80,7 @@ void FillDiagonalTensorGradKernel(const Context &ctx, ...@@ -80,7 +80,7 @@ void FillDiagonalTensorGradKernel(const Context &ctx,
tensor_tmp.Resize(phi::make_ddim({2 + matrows})); tensor_tmp.Resize(phi::make_ddim({2 + matrows}));
int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp); int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp);
const auto gpu_place = ctx.GetPlace(); const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(gpu_place, memory_utils::Copy(gpu_place,
memory_block_cu, memory_block_cu,
CPUPlace(), CPUPlace(),
memory_block.data(), memory_block.data(),
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -96,7 +96,7 @@ void FillDiagonalTensorKernel(const Context &ctx, ...@@ -96,7 +96,7 @@ void FillDiagonalTensorKernel(const Context &ctx,
tensor_tmp.Resize(phi::make_ddim({2 + fill_dims[0]})); tensor_tmp.Resize(phi::make_ddim({2 + fill_dims[0]}));
int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp); int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp);
const auto gpu_place = ctx.GetPlace(); const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(gpu_place, memory_utils::Copy(gpu_place,
memory_block_cu, memory_block_cu,
CPUPlace(), CPUPlace(),
memory_block.data(), memory_block.data(),
......
...@@ -311,7 +311,7 @@ static void NMS(const phi::GPUContext &ctx, ...@@ -311,7 +311,7 @@ static void NMS(const phi::GPUContext &ctx,
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
std::vector<uint64_t> mask_host(boxes_num * col_blocks); std::vector<uint64_t> mask_host(boxes_num * col_blocks);
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
mask_host.data(), mask_host.data(),
place, place,
mask_dev, mask_dev,
...@@ -335,7 +335,7 @@ static void NMS(const phi::GPUContext &ctx, ...@@ -335,7 +335,7 @@ static void NMS(const phi::GPUContext &ctx,
} }
keep_out->Resize(phi::make_ddim({num_to_keep})); keep_out->Resize(phi::make_ddim({num_to_keep}));
int *keep = ctx.template Alloc<int>(keep_out); int *keep = ctx.template Alloc<int>(keep_out);
paddle::memory::Copy(place, memory_utils::Copy(place,
keep, keep,
CPUPlace(), CPUPlace(),
keep_vec.data(), keep_vec.data(),
...@@ -401,7 +401,7 @@ static std::pair<DenseTensor, DenseTensor> ProposalForOneImage( ...@@ -401,7 +401,7 @@ static std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
pixel_offset); pixel_offset);
int keep_num; int keep_num;
const auto gpu_place = ctx.GetPlace(); const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
&keep_num, &keep_num,
gpu_place, gpu_place,
keep_num_t.data<int>(), keep_num_t.data<int>(),
...@@ -542,13 +542,13 @@ void GenerateProposalsKernel(const Context &ctx, ...@@ -542,13 +542,13 @@ void GenerateProposalsKernel(const Context &ctx,
DenseTensor &proposals = box_score_pair.first; DenseTensor &proposals = box_score_pair.first;
DenseTensor &nscores = box_score_pair.second; DenseTensor &nscores = box_score_pair.second;
paddle::memory::Copy(place, memory_utils::Copy(place,
rpn_rois_data + num_proposals * 4, rpn_rois_data + num_proposals * 4,
place, place,
proposals.data<T>(), proposals.data<T>(),
sizeof(T) * proposals.numel(), sizeof(T) * proposals.numel(),
ctx.stream()); ctx.stream());
paddle::memory::Copy(place, memory_utils::Copy(place,
rpn_roi_probs_data + num_proposals, rpn_roi_probs_data + num_proposals,
place, place,
nscores.data<T>(), nscores.data<T>(),
...@@ -563,7 +563,7 @@ void GenerateProposalsKernel(const Context &ctx, ...@@ -563,7 +563,7 @@ void GenerateProposalsKernel(const Context &ctx,
rpn_rois_num->Resize(phi::make_ddim({num})); rpn_rois_num->Resize(phi::make_ddim({num}));
ctx.template Alloc<int>(rpn_rois_num); ctx.template Alloc<int>(rpn_rois_num);
int *num_data = rpn_rois_num->data<int>(); int *num_data = rpn_rois_num->data<int>();
paddle::memory::Copy(place, memory_utils::Copy(place,
num_data, num_data,
cpu_place, cpu_place,
&tmp_num[0], &tmp_num[0],
......
...@@ -28,7 +28,6 @@ ...@@ -28,7 +28,6 @@
namespace cub = hipcub; namespace cub = hipcub;
#endif #endif
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -119,7 +118,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx, ...@@ -119,7 +118,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
info, info,
gesvdj_params)); gesvdj_params));
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
...@@ -199,7 +198,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx, ...@@ -199,7 +198,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
gesvdj_params)); gesvdj_params));
// check the error info // check the error info
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
...@@ -255,7 +254,7 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx, ...@@ -255,7 +254,7 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
params)); params));
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
...@@ -310,7 +309,7 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx, ...@@ -310,7 +309,7 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
info, info,
params)); params));
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "paddle/phi/kernels/mean_all_kernel.h" #include "paddle/phi/kernels/mean_all_kernel.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/primitive/functor_primitives.h" #include "paddle/phi/kernels/primitive/functor_primitives.h"
...@@ -33,7 +33,7 @@ void MeanAllKernel(const Context& dev_ctx, ...@@ -33,7 +33,7 @@ void MeanAllKernel(const Context& dev_ctx,
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
if (rank == 0) { // scalar if (rank == 0) { // scalar
paddle::memory::Copy( memory_utils::Copy(
place, out_data, place, in_data, numel * sizeof(T), stream); place, out_data, place, in_data, numel * sizeof(T), stream);
return; return;
} }
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_grad_kernel.h" #include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
...@@ -47,7 +47,7 @@ void MultiplexGradKernel(const Context& ctx, ...@@ -47,7 +47,7 @@ void MultiplexGradKernel(const Context& ctx,
for (auto i = 0; i < rows; i++) { for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]); size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) { if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(), memory_utils::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols, ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(), ctx.GetPlace(),
out_grad.data<T>() + i * cols, out_grad.data<T>() + i * cols,
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_kernel.h" #include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -50,7 +50,7 @@ void MultiplexKernel(const Context& ctx, ...@@ -50,7 +50,7 @@ void MultiplexKernel(const Context& ctx,
ins.size(), ins.size(),
errors::PreconditionNotMet( errors::PreconditionNotMet(
"index exceeds the number of candidate tensors.")); "index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(), memory_utils::Copy(ctx.GetPlace(),
out->data<T>() + i * cols, out->data<T>() + i * cols,
ctx.GetPlace(), ctx.GetPlace(),
ins[k]->data<T>() + i * cols, ins[k]->data<T>() + i * cols,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/nanmedian_kernel.h" #include "paddle/phi/kernels/nanmedian_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
...@@ -180,7 +179,7 @@ void ProcessMedianKernel(const Context& dev_ctx, ...@@ -180,7 +179,7 @@ void ProcessMedianKernel(const Context& dev_ctx,
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2); phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
int64_t* nan_stat_cpu_ptr = int64_t* nan_stat_cpu_ptr =
reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr()); reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr());
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
nan_stat_cpu_ptr, nan_stat_cpu_ptr,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
nan_stat_mem, nan_stat_mem,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/nms_kernel.h" #include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
...@@ -83,7 +82,7 @@ void NMSKernel(const Context& dev_ctx, ...@@ -83,7 +82,7 @@ void NMSKernel(const Context& dev_ctx,
NMS<T><<<grid, block, 0, dev_ctx.stream()>>>( NMS<T><<<grid, block, 0, dev_ctx.stream()>>>(
boxes.data<T>(), threshold, num_boxes, mask_dev); boxes.data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line); std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
mask_host.data(), mask_host.data(),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
mask_dev, mask_dev,
...@@ -106,7 +105,7 @@ void NMSKernel(const Context& dev_ctx, ...@@ -106,7 +105,7 @@ void NMSKernel(const Context& dev_ctx,
} }
output->Resize(phi::make_ddim({last_box_num})); output->Resize(phi::make_ddim({last_box_num}));
auto* output_data = dev_ctx.template Alloc<int64_t>(output); auto* output_data = dev_ctx.template Alloc<int64_t>(output);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
output_data, output_data,
phi::CPUPlace(), phi::CPUPlace(),
output_host, output_host,
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -128,7 +128,7 @@ void PsroiPoolGradKernel(const Context& ctx, ...@@ -128,7 +128,7 @@ void PsroiPoolGradKernel(const Context& ctx,
if (rois_num.get_ptr()) { if (rois_num.get_ptr()) {
rois_batch_size = rois_num->numel(); rois_batch_size = rois_num->numel();
std::vector<int> rois_num_list(rois_batch_size); std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
rois_num_list.data(), rois_num_list.data(),
ctx.GetPlace(), ctx.GetPlace(),
rois_num->data<int>(), rois_num->data<int>(),
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -150,7 +150,7 @@ void PsroiPoolKernel(const Context& ctx, ...@@ -150,7 +150,7 @@ void PsroiPoolKernel(const Context& ctx,
rois_batch_size, rois_batch_size,
batch_size)); batch_size));
std::vector<int> rois_num_list(rois_batch_size); std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
rois_num_list.data(), rois_num_list.data(),
ctx.GetPlace(), ctx.GetPlace(),
rois_num_data, rois_num_data,
......
...@@ -18,9 +18,9 @@ ...@@ -18,9 +18,9 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/infermeta/unary.h"
...@@ -139,7 +139,7 @@ void QrKernel(const Context& ctx, ...@@ -139,7 +139,7 @@ void QrKernel(const Context& ctx,
auto new_qr_data = ctx.template Alloc<phi::dtype::Real<T>>(&new_qr); auto new_qr_data = ctx.template Alloc<phi::dtype::Real<T>>(&new_qr);
auto new_qr_stride = m * m; auto new_qr_stride = m * m;
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
paddle::memory::Copy(ctx.GetPlace(), memory_utils::Copy(ctx.GetPlace(),
(new_qr_data + i * new_qr_stride), (new_qr_data + i * new_qr_stride),
ctx.GetPlace(), ctx.GetPlace(),
(qr_data + i * qr_stride), (qr_data + i * qr_stride),
...@@ -218,7 +218,7 @@ void BatchedGeqrf<GPUContext, float>(const GPUContext& dev_ctx, ...@@ -218,7 +218,7 @@ void BatchedGeqrf<GPUContext, float>(const GPUContext& dev_ctx,
// Do we need synchronized here? // Do we need synchronized here?
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
...@@ -272,7 +272,7 @@ void BatchedGeqrf<GPUContext, double>(const GPUContext& dev_ctx, ...@@ -272,7 +272,7 @@ void BatchedGeqrf<GPUContext, double>(const GPUContext& dev_ctx,
// Do we need synchronized here? // Do we need synchronized here?
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
...@@ -328,7 +328,7 @@ void BatchedOrgqr<GPUContext, float>(const GPUContext& dev_ctx, ...@@ -328,7 +328,7 @@ void BatchedOrgqr<GPUContext, float>(const GPUContext& dev_ctx,
// Do we need synchronized here? // Do we need synchronized here?
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
...@@ -384,7 +384,7 @@ void BatchedOrgqr<GPUContext, double>(const GPUContext& dev_ctx, ...@@ -384,7 +384,7 @@ void BatchedOrgqr<GPUContext, double>(const GPUContext& dev_ctx,
// Do we need synchronized here? // Do we need synchronized here?
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
......
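The cuSOLVER-based kernels (qr, svd, cholesky, eigh) all follow the same "check the error info" step: a single int status is copied back to the host through memory_utils::Copy and then validated. A small sketch of that round trip, assuming `info_d` points at one device-side int and using a plain boolean result in place of Paddle's enforce macros:

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"

// Illustrative only: fetch a solver status flag from device memory and report
// whether the factorization succeeded (info == 0).
bool DeviceInfoIsZero(const phi::GPUContext& dev_ctx, const int* info_d) {
  int info_h = -1;
  phi::memory_utils::Copy(phi::CPUPlace(),
                          &info_h,
                          dev_ctx.GetPlace(),
                          info_d,
                          sizeof(int),
                          dev_ctx.stream());
  dev_ctx.Wait();  // the copy is queued on the stream; wait before reading
  return info_h == 0;
}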
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/distribution_helper.h"
// See Note [ Why still include the fluid headers? ] // See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#pragma once #pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
...@@ -287,7 +287,7 @@ void WeightToTensor(const Place &place, ...@@ -287,7 +287,7 @@ void WeightToTensor(const Place &place,
const T *in_data = weight_list[i]->data<T>(); const T *in_data = weight_list[i]->data<T>();
auto in_size = weight_list[i]->numel(); auto in_size = weight_list[i]->numel();
paddle::memory::Copy(weight->place(), memory_utils::Copy(weight->place(),
weight_data + weight_offset, weight_data + weight_offset,
weight_list[i]->place(), weight_list[i]->place(),
in_data, in_data,
...@@ -310,7 +310,7 @@ void WeightListToTensor(const Place &place, ...@@ -310,7 +310,7 @@ void WeightListToTensor(const Place &place,
for (size_t i = 0; i < tensor_list.size(); ++i) { for (size_t i = 0; i < tensor_list.size(); ++i) {
const T *in_data = tensor_list[i].data<T>(); const T *in_data = tensor_list[i].data<T>();
auto in_size = tensor_list[i].numel(); auto in_size = tensor_list[i].numel();
paddle::memory::Copy(weight_whole->place(), memory_utils::Copy(weight_whole->place(),
weight_data + weight_offset, weight_data + weight_offset,
tensor_list[i].place(), tensor_list[i].place(),
in_data, in_data,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_align_grad_kernel.h" #include "paddle/phi/kernels/roi_align_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
...@@ -195,7 +194,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, ...@@ -195,7 +194,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
if (boxes_num) { if (boxes_num) {
int boxes_batch_size = boxes_num->numel(); int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size); std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(cplace, memory_utils::Copy(cplace,
boxes_num_list.data(), boxes_num_list.data(),
gplace, gplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -223,7 +222,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, ...@@ -223,7 +222,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr()); int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
int bytes = box_batch_id_list.numel() * sizeof(int); int bytes = box_batch_id_list.numel() * sizeof(int);
paddle::memory::Copy( memory_utils::Copy(
gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream()); gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream());
dev_ctx.template Alloc<T>(dx); dev_ctx.template Alloc<T>(dx);
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_align_kernel.h" #include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
...@@ -180,7 +179,7 @@ void RoiAlignKernel(const Context& dev_ctx, ...@@ -180,7 +179,7 @@ void RoiAlignKernel(const Context& dev_ctx,
batch_size)); batch_size));
std::vector<int> boxes_num_list(boxes_batch_size); std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(cplace, memory_utils::Copy(cplace,
boxes_num_list.data(), boxes_num_list.data(),
gplace, gplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -233,7 +232,7 @@ void RoiAlignKernel(const Context& dev_ctx, ...@@ -233,7 +232,7 @@ void RoiAlignKernel(const Context& dev_ctx,
bytes, bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr()); int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy( memory_utils::Copy(
gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());
GPURoiAlignForward<T> GPURoiAlignForward<T>
<<<blocks, threads, 0, dev_ctx.stream()>>>(output_size, <<<blocks, threads, 0, dev_ctx.stream()>>>(output_size,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_pool_grad_kernel.h" #include "paddle/phi/kernels/roi_pool_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
...@@ -98,7 +97,7 @@ void RoiPoolGradKernel(const Context& dev_ctx, ...@@ -98,7 +97,7 @@ void RoiPoolGradKernel(const Context& dev_ctx,
if (boxes_num) { if (boxes_num) {
int boxes_batch_size = boxes_num->numel(); int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size); std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
boxes_num_list.data(), boxes_num_list.data(),
gplace, gplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -126,7 +125,7 @@ void RoiPoolGradKernel(const Context& dev_ctx, ...@@ -126,7 +125,7 @@ void RoiPoolGradKernel(const Context& dev_ctx,
bytes, bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr()); int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy(gplace, memory_utils::Copy(gplace,
roi_id_data, roi_id_data,
phi::CPUPlace(), phi::CPUPlace(),
box_batch_id_data, box_batch_id_data,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_pool_kernel.h" #include "paddle/phi/kernels/roi_pool_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
...@@ -142,7 +141,7 @@ void RoiPoolKernel(const Context& dev_ctx, ...@@ -142,7 +141,7 @@ void RoiPoolKernel(const Context& dev_ctx,
boxes_batch_size, boxes_batch_size,
batch_size)); batch_size));
std::vector<int> boxes_num_list(boxes_batch_size); std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
boxes_num_list.data(), boxes_num_list.data(),
gplace, gplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -190,7 +189,7 @@ void RoiPoolKernel(const Context& dev_ctx, ...@@ -190,7 +189,7 @@ void RoiPoolKernel(const Context& dev_ctx,
bytes, bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr()); int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr());
paddle::memory::Copy(gplace, memory_utils::Copy(gplace,
box_id_data, box_id_data,
phi::CPUPlace(), phi::CPUPlace(),
box_batch_id_data, box_batch_id_data,
......
...@@ -90,7 +90,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, ...@@ -90,7 +90,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx,
T *norm = dev_ctx.template Alloc<T>(norm_tensor); T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T)); auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr()); T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
norm_cpu_ptr, norm_cpu_ptr,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
norm, norm,
......
...@@ -89,7 +89,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, ...@@ -89,7 +89,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx,
T *norm = dev_ctx.template Alloc<T>(norm_tensor); T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T)); auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr()); T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
norm_cpu_ptr, norm_cpu_ptr,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
norm, norm,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "paddle/phi/kernels/svd_grad_kernel.h" #include "paddle/phi/kernels/svd_grad_kernel.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/svd_grad_kernel_impl.h" #include "paddle/phi/kernels/impl/svd_grad_kernel_impl.h"
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include "paddle/phi/kernels/svd_kernel.h" #include "paddle/phi/kernels/svd_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -105,7 +104,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx, ...@@ -105,7 +104,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
gesvdj_params)); gesvdj_params));
// check the error info // check the error info
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
...@@ -186,7 +185,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx, ...@@ -186,7 +185,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
gesvdj_params)); gesvdj_params));
// check the error info // check the error info
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
......
...@@ -76,7 +76,7 @@ void SyncBatchNormKernel(const Context &ctx, ...@@ -76,7 +76,7 @@ void SyncBatchNormKernel(const Context &ctx,
const int block = 512; const int block = 512;
int max_threads = ctx.GetMaxPhysicalThreadCount(); int max_threads = ctx.GetMaxPhysicalThreadCount();
paddle::memory::AllocationPtr alloc_ptr{nullptr}; phi::Allocator::AllocationPtr alloc_ptr{nullptr};
if (test_mode) { if (test_mode) {
mean_data = mean.template data<BatchNormParamType<T>>(); mean_data = mean.template data<BatchNormParamType<T>>();
......
...@@ -23,9 +23,6 @@ ...@@ -23,9 +23,6 @@
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/common_shape.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memory.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
...@@ -98,7 +95,7 @@ void TriangularSolveKernel(const Context& dev_ctx, ...@@ -98,7 +95,7 @@ void TriangularSolveKernel(const Context& dev_ctx,
cpu_ptrs.size() * sizeof(T*), cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(), tmp_gpu_ptrs_data->ptr(),
paddle::platform::CPUPlace(), paddle::platform::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()), static_cast<void*>(cpu_ptrs.data()),
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/yolo_box_kernel.h" #include "paddle/phi/kernels/yolo_box_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/yolo_box_util.h" #include "paddle/phi/kernels/funcs/yolo_box_util.h"
...@@ -133,7 +133,7 @@ void YoloBoxKernel(const Context& dev_ctx, ...@@ -133,7 +133,7 @@ void YoloBoxKernel(const Context& dev_ctx,
int* anchors_data = dev_ctx.template Alloc<int>(&tmp_anchors); int* anchors_data = dev_ctx.template Alloc<int>(&tmp_anchors);
const auto gplace = dev_ctx.GetPlace(); const auto gplace = dev_ctx.GetPlace();
const auto cplace = phi::CPUPlace(); const auto cplace = phi::CPUPlace();
paddle::memory::Copy( memory_utils::Copy(
gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream()); gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream());
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
......
...@@ -20,8 +20,8 @@ limitations under the License. */ ...@@ -20,8 +20,8 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
...@@ -49,9 +49,9 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { ...@@ -49,9 +49,9 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
if (!use_fixed_workspace) { if (!use_fixed_workspace) {
int device_id = phi::backends::gpu::GetCurrentDeviceId(); int device_id = phi::backends::gpu::GetCurrentDeviceId();
int64_t allocated = int64_t allocated =
paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id); memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved = int64_t reserved =
paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id); memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = paddle::platform::GpuAvailableMemToAlloc(); int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved) << " MB, reserved=" << ToMegaBytes(reserved)
......
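Beyond Copy, the workspace-limit hunk above also routes the per-device allocation statistics through memory_utils. A minimal sketch of that query, assuming device 0 and the same stat names used in the hunk ("Allocated" and "Reserved"):

#include <cstdint>
#include <iostream>

#include "paddle/phi/common/memory_utils.h"

// Illustrative only: print the currently allocated and reserved byte counts
// tracked for one GPU.
void PrintDeviceMemoryStats() {
  const int device_id = 0;  // assumption: querying the first device
  int64_t allocated =
      phi::memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
  int64_t reserved =
      phi::memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
  std::cout << "allocated=" << allocated << " bytes, reserved=" << reserved
            << " bytes" << std::endl;
}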
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
// TODO(xiongkun): remove the header when decouple the memcpy function in phi. // TODO(xiongkun): remove the header when decouple the memcpy function in phi.
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
using Tensor = DenseTensor; using Tensor = DenseTensor;
...@@ -58,7 +58,7 @@ struct GetTensorValue<phi::GPUContext, T> { ...@@ -58,7 +58,7 @@ struct GetTensorValue<phi::GPUContext, T> {
const T* data = tensor.data<T>(); const T* data = tensor.data<T>();
T value; T value;
const auto gpu_place = dev_ctx.GetPlace(); const auto gpu_place = dev_ctx.GetPlace();
paddle::memory::Copy( memory_utils::Copy(
phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream()); phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
return value; return value;
} }
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#pragma once #pragma once
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/utils/optional.h" #include "paddle/utils/optional.h"
...@@ -153,7 +153,7 @@ inline void BatchedOrmqr<GPUContext, float>(const GPUContext& dev_ctx, ...@@ -153,7 +153,7 @@ inline void BatchedOrmqr<GPUContext, float>(const GPUContext& dev_ctx,
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
...@@ -222,7 +222,7 @@ inline void BatchedOrmqr<GPUContext, double>(const GPUContext& dev_ctx, ...@@ -222,7 +222,7 @@ inline void BatchedOrmqr<GPUContext, double>(const GPUContext& dev_ctx,
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#pragma once #pragma once
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/utils/optional.h" #include "paddle/utils/optional.h"
......
...@@ -62,7 +62,7 @@ void MemcpyD2HKernel(const Context& dev_ctx, ...@@ -62,7 +62,7 @@ void MemcpyD2HKernel(const Context& dev_ctx,
case 1: case 1:
Copy(dev_ctx, x, GPUPinnedPlace(), false, out); Copy(dev_ctx, x, GPUPinnedPlace(), false, out);
// paddle::memory::Copy use async copy for GPUPinnedPlace // Copy use async copy for GPUPinnedPlace
dev_ctx.Wait(); dev_ctx.Wait();
break; break;
......
...@@ -71,7 +71,7 @@ void AdamDenseParamSparseGradKernel( ...@@ -71,7 +71,7 @@ void AdamDenseParamSparseGradKernel(
if (beta1_pow.dtype() == DataType::FLOAT16) { if (beta1_pow.dtype() == DataType::FLOAT16) {
XPUType* beta1_pow_t = XPUType* beta1_pow_t =
RAII_GUARD.alloc_l3_or_gm<XPUType>(beta1_pow.numel()); RAII_GUARD.alloc_l3_or_gm<XPUType>(beta1_pow.numel());
paddle::memory::Copy(param.place(), memory_utils::Copy(param.place(),
beta1_pow_t, beta1_pow_t,
beta1_pow.place(), beta1_pow.place(),
beta1_pow.data<T>(), beta1_pow.data<T>(),
...@@ -82,7 +82,7 @@ void AdamDenseParamSparseGradKernel( ...@@ -82,7 +82,7 @@ void AdamDenseParamSparseGradKernel(
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else { } else {
beta1_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta1_pow.numel()); beta1_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta1_pow.numel());
paddle::memory::Copy(param.place(), memory_utils::Copy(param.place(),
beta1_pow_ptr, beta1_pow_ptr,
beta1_pow.place(), beta1_pow.place(),
beta1_pow.data<T>(), beta1_pow.data<T>(),
...@@ -103,7 +103,7 @@ void AdamDenseParamSparseGradKernel( ...@@ -103,7 +103,7 @@ void AdamDenseParamSparseGradKernel(
if (beta2_pow.dtype() == DataType::FLOAT16) { if (beta2_pow.dtype() == DataType::FLOAT16) {
XPUType* beta2_pow_t = XPUType* beta2_pow_t =
RAII_GUARD.alloc_l3_or_gm<XPUType>(beta2_pow.numel()); RAII_GUARD.alloc_l3_or_gm<XPUType>(beta2_pow.numel());
paddle::memory::Copy(param.place(), memory_utils::Copy(param.place(),
beta2_pow_t, beta2_pow_t,
beta2_pow.place(), beta2_pow.place(),
beta2_pow.data<T>(), beta2_pow.data<T>(),
...@@ -114,7 +114,7 @@ void AdamDenseParamSparseGradKernel( ...@@ -114,7 +114,7 @@ void AdamDenseParamSparseGradKernel(
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else { } else {
beta2_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta2_pow.numel()); beta2_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta2_pow.numel());
paddle::memory::Copy(param.place(), memory_utils::Copy(param.place(),
beta2_pow_ptr, beta2_pow_ptr,
beta2_pow.place(), beta2_pow.place(),
beta2_pow.data<T>(), beta2_pow.data<T>(),
...@@ -233,7 +233,7 @@ void AdamDenseParamSparseGradKernel( ...@@ -233,7 +233,7 @@ void AdamDenseParamSparseGradKernel(
rows[i] = static_cast<int>(merge_rows[i]); rows[i] = static_cast<int>(merge_rows[i]);
} }
xpu_wait(dev_ctx.x_context()->xpu_stream); xpu_wait(dev_ctx.x_context()->xpu_stream);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
xpu_rows, xpu_rows,
CPUPlace(), CPUPlace(),
rows.data(), rows.data(),
......
...@@ -15,11 +15,10 @@ limitations under the License. */ ...@@ -15,11 +15,10 @@ limitations under the License. */
#include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/activation_functor.h" #include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/fluid/memory/memory.h"
namespace phi { namespace phi {
template <typename T, typename Context, typename Functor> template <typename T, typename Context, typename Functor>
...@@ -207,7 +206,7 @@ void PowKernel(const Context& dev_ctx, ...@@ -207,7 +206,7 @@ void PowKernel(const Context& dev_ctx,
T* factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1); T* factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
factor_data, errors::External("XPU alloc_l3_or_gm returns nullptr")); factor_data, errors::External("XPU alloc_l3_or_gm returns nullptr"));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
static_cast<void*>(factor_data), static_cast<void*>(factor_data),
phi::CPUPlace(), phi::CPUPlace(),
static_cast<void*>(&pow_factor), static_cast<void*>(&pow_factor),
......
...@@ -18,11 +18,11 @@ limitations under the License. */ ...@@ -18,11 +18,11 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -53,7 +53,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx, ...@@ -53,7 +53,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
const bool* found_inf_data = found_infinite.data<bool>(); const bool* found_inf_data = found_infinite.data<bool>();
bool cpu_found_inf_data = false; bool cpu_found_inf_data = false;
if (found_infinite.place().GetType() == phi::AllocationType::XPU) { if (found_infinite.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data), static_cast<void*>(&cpu_found_inf_data),
found_infinite.place(), found_infinite.place(),
static_cast<const void*>(found_inf_data), static_cast<const void*>(found_inf_data),
...@@ -93,7 +93,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx, ...@@ -93,7 +93,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
int cpu_good_in_data; int cpu_good_in_data;
MPDType cpu_pre_loss_scaling_data; MPDType cpu_pre_loss_scaling_data;
if (in_bad_steps.place().GetType() == phi::AllocationType::XPU) { if (in_bad_steps.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_bad_in_data), static_cast<void*>(&cpu_bad_in_data),
in_bad_steps.place(), in_bad_steps.place(),
static_cast<const void*>(bad_in_data), static_cast<const void*>(bad_in_data),
...@@ -103,7 +103,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx, ...@@ -103,7 +103,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
} }
if (in_good_steps.place().GetType() == phi::AllocationType::XPU) { if (in_good_steps.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_good_in_data), static_cast<void*>(&cpu_good_in_data),
in_good_steps.place(), in_good_steps.place(),
static_cast<const void*>(good_in_data), static_cast<const void*>(good_in_data),
...@@ -113,7 +113,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx, ...@@ -113,7 +113,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
} }
if (prev_loss_scaling.place().GetType() == phi::AllocationType::XPU) { if (prev_loss_scaling.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_pre_loss_scaling_data), static_cast<void*>(&cpu_pre_loss_scaling_data),
prev_loss_scaling.place(), prev_loss_scaling.place(),
static_cast<const void*>(pre_loss_scaling_data), static_cast<const void*>(pre_loss_scaling_data),
...@@ -148,17 +148,17 @@ void UpdateLossScalingKernel(const Context& dev_ctx, ...@@ -148,17 +148,17 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
} }
} }
// copy to device // copy to device
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
bad_out_data, bad_out_data,
phi::CPUPlace(), phi::CPUPlace(),
&cpu_bad_out_data, &cpu_bad_out_data,
sizeof(int)); sizeof(int));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
good_out_data, good_out_data,
phi::CPUPlace(), phi::CPUPlace(),
&cpu_good_out_data, &cpu_good_out_data,
sizeof(int)); sizeof(int));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
updated_loss_scaling_data, updated_loss_scaling_data,
phi::CPUPlace(), phi::CPUPlace(),
&cpu_updated_loss_scaling_data, &cpu_updated_loss_scaling_data,
...@@ -185,7 +185,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -185,7 +185,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
int nums_inf_nans = 0; int nums_inf_nans = 0;
MPDType cpu_scale_data; MPDType cpu_scale_data;
if (scale.place().GetType() == phi::AllocationType::XPU) { if (scale.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_scale_data), static_cast<void*>(&cpu_scale_data),
scale.place(), scale.place(),
static_cast<const void*>(scale_data), static_cast<const void*>(scale_data),
...@@ -211,7 +211,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -211,7 +211,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
inf_nan_count.data<int>(), inf_nan_count.data<int>(),
x->numel()); x->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "count_nan_or_inf"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "count_nan_or_inf");
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&nums_inf_nans, &nums_inf_nans,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
inf_nan_count.data<int>(), inf_nan_count.data<int>(),
...@@ -264,7 +264,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -264,7 +264,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
} }
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
found_inf_data, found_inf_data,
phi::CPUPlace(), phi::CPUPlace(),
&cpu_found_inf_data, &cpu_found_inf_data,
......
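Every hunk in this file follows the same mechanical migration: the kernel logic is unchanged and only the copy call site moves from paddle::memory::Copy to the phi::memory_utils::Copy wrapper, so PHI kernels no longer pull in fluid's memcpy headers. A minimal sketch of the migrated synchronous device-to-host call shape follows; the helper name and arguments are hypothetical and only illustrate the pattern, not any specific kernel in this commit.

// Illustrative sketch only: CopyScalarToHost is a hypothetical helper that
// mirrors the call shape used by the XPU kernels above after the migration.
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"

namespace phi {

// Synchronously copies a single int from the tensor's device place to CPU
// through the PHI wrapper, replacing the former paddle::memory::Copy call.
inline void CopyScalarToHost(const DenseTensor& device_scalar,
                             int* host_value) {
  memory_utils::Copy(phi::CPUPlace(),
                     static_cast<void*>(host_value),
                     device_scalar.place(),
                     static_cast<const void*>(device_scalar.data<int>()),
                     sizeof(int));
}

}  // namespace phi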
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -46,7 +46,7 @@ void DropoutRawKernel(const Context& dev_ctx, ...@@ -46,7 +46,7 @@ void DropoutRawKernel(const Context& dev_ctx,
int seed_data = 0; int seed_data = 0;
if (seed_tensor.get_ptr() != nullptr) { if (seed_tensor.get_ptr() != nullptr) {
if ((seed_tensor->place()).GetType() == phi::AllocationType::XPU) { if ((seed_tensor->place()).GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&seed_data, &seed_data,
seed_tensor->place(), seed_tensor->place(),
seed_tensor->data<int>(), seed_tensor->data<int>(),
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h" #include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/embedding_util.h" #include "paddle/phi/kernels/funcs/embedding_util.h"
...@@ -99,7 +99,7 @@ void EmbeddingSparseGradKernel(const Context& ctx, ...@@ -99,7 +99,7 @@ void EmbeddingSparseGradKernel(const Context& ctx,
int r = xpu::cast<int32_t, int64_t>( int r = xpu::cast<int32_t, int64_t>(
ctx.x_context(), input.data<int>(), id_t, input.numel()); ctx.x_context(), input.data<int>(), id_t, input.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
ids_cpu.data(), ids_cpu.data(),
input.place(), input.place(),
id_t, id_t,
...@@ -140,7 +140,7 @@ void EmbeddingSparseGradKernel(const Context& ctx, ...@@ -140,7 +140,7 @@ void EmbeddingSparseGradKernel(const Context& ctx,
d_table_value->dims(), d_table_value->dims(),
d_output_dims_2d)); d_output_dims_2d));
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
d_table_data, d_table_data,
xpu_place, xpu_place,
d_output_data, d_output_data,
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "paddle/phi/core/visit_type.h" #include "paddle/phi/core/visit_type.h"
// See Note [ Why still include the fluid headers? ] // See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/gaussian_kernel.h" #include "paddle/phi/kernels/gaussian_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/generator.h" #include "paddle/phi/core/generator.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -48,7 +48,7 @@ void GaussianKernel(const Context& ctx, ...@@ -48,7 +48,7 @@ void GaussianKernel(const Context& ctx,
for (int64_t i = 0; i < size; ++i) { for (int64_t i = 0; i < size; ++i) {
data_cpu[i] = dist(*engine); data_cpu[i] = dist(*engine);
} }
paddle::memory::Copy(ctx.GetPlace(), memory_utils::Copy(ctx.GetPlace(),
data, data,
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()), reinterpret_cast<void*>(data_cpu.get()),
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h" #include "paddle/phi/kernels/funcs/math_function_impl.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -37,7 +37,7 @@ static void SortDescending(const XPUContext& dev_ctx, ...@@ -37,7 +37,7 @@ static void SortDescending(const XPUContext& dev_ctx,
scores_slice_cpu.Resize({value.numel()}); scores_slice_cpu.Resize({value.numel()});
T* scores_slice_cpu_data = dev_ctx.template HostAlloc<T>(&scores_slice_cpu); T* scores_slice_cpu_data = dev_ctx.template HostAlloc<T>(&scores_slice_cpu);
paddle::memory::Copy(cpu_place, memory_utils::Copy(cpu_place,
scores_slice_cpu_data, scores_slice_cpu_data,
place, place,
value_data, value_data,
...@@ -65,7 +65,7 @@ static void SortDescending(const XPUContext& dev_ctx, ...@@ -65,7 +65,7 @@ static void SortDescending(const XPUContext& dev_ctx,
index_out->Resize({index_t.numel()}); index_out->Resize({index_t.numel()});
int* idx_out = dev_ctx.template Alloc<int>(index_out); int* idx_out = dev_ctx.template Alloc<int>(index_out);
paddle::memory::Copy( memory_utils::Copy(
place, idx_out, cpu_place, index, sizeof(T) * index_t.numel()); place, idx_out, cpu_place, index, sizeof(T) * index_t.numel());
} }
...@@ -180,7 +180,7 @@ std::pair<DenseTensor, DenseTensor> ProposalForOneImage( ...@@ -180,7 +180,7 @@ std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
int keep_num; int keep_num;
const auto xpu_place = dev_ctx.GetPlace(); const auto xpu_place = dev_ctx.GetPlace();
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&keep_num, &keep_num,
xpu_place, xpu_place,
keep_num_t.data<int>(), keep_num_t.data<int>(),
...@@ -395,7 +395,7 @@ void GenerateProposalsKernel(const Context& dev_ctx, ...@@ -395,7 +395,7 @@ void GenerateProposalsKernel(const Context& dev_ctx,
rpn_rois_num->Resize(phi::make_ddim({num})); rpn_rois_num->Resize(phi::make_ddim({num}));
dev_ctx.template Alloc<int>(rpn_rois_num); dev_ctx.template Alloc<int>(rpn_rois_num);
int* num_data = rpn_rois_num->data<int>(); int* num_data = rpn_rois_num->data<int>();
paddle::memory::Copy( memory_utils::Copy(
place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num); place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num);
} }
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/lamb_kernel.h" #include "paddle/phi/kernels/lamb_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -61,7 +61,7 @@ void LambKernel(const Context& dev_ctx, ...@@ -61,7 +61,7 @@ void LambKernel(const Context& dev_ctx,
cpu_skip_update = *(skip_update->data<bool>()); cpu_skip_update = *(skip_update->data<bool>());
} else { } else {
const bool* skip_update_flag = skip_update->data<bool>(); const bool* skip_update_flag = skip_update->data<bool>();
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_skip_update), static_cast<void*>(&cpu_skip_update),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
static_cast<const void*>(skip_update_flag), static_cast<const void*>(skip_update_flag),
...@@ -114,7 +114,7 @@ void LambKernel(const Context& dev_ctx, ...@@ -114,7 +114,7 @@ void LambKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&beta1_pow_xpu_ptr), int r = xpu_malloc(reinterpret_cast<void**>(&beta1_pow_xpu_ptr),
(beta1_pow.numel()) * sizeof(MT)); (beta1_pow.numel()) * sizeof(MT));
PADDLE_ENFORCE_XPU_SUCCESS(r); PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
beta1_pow_xpu_ptr, beta1_pow_xpu_ptr,
beta1_pow.place(), beta1_pow.place(),
beta1_pow.data<MT>(), beta1_pow.data<MT>(),
...@@ -130,7 +130,7 @@ void LambKernel(const Context& dev_ctx, ...@@ -130,7 +130,7 @@ void LambKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&beta2_pow_xpu_ptr), int r = xpu_malloc(reinterpret_cast<void**>(&beta2_pow_xpu_ptr),
(beta2_pow.numel()) * sizeof(MT)); (beta2_pow.numel()) * sizeof(MT));
PADDLE_ENFORCE_XPU_SUCCESS(r); PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
beta2_pow_xpu_ptr, beta2_pow_xpu_ptr,
beta2_pow.place(), beta2_pow.place(),
beta2_pow.data<MT>(), beta2_pow.data<MT>(),
...@@ -198,7 +198,7 @@ void LambKernel(const Context& dev_ctx, ...@@ -198,7 +198,7 @@ void LambKernel(const Context& dev_ctx,
if (beta1_pow.place().GetType() == phi::AllocationType::CPU) { if (beta1_pow.place().GetType() == phi::AllocationType::CPU) {
// copy beta1_pow_out from xpu to cpu // copy beta1_pow_out from xpu to cpu
paddle::memory::Copy(beta1_pow.place(), memory_utils::Copy(beta1_pow.place(),
dev_ctx.template HostAlloc<MT>(beta1_pow_out), dev_ctx.template HostAlloc<MT>(beta1_pow_out),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
beta1_pow_out_ptr, beta1_pow_out_ptr,
...@@ -209,7 +209,7 @@ void LambKernel(const Context& dev_ctx, ...@@ -209,7 +209,7 @@ void LambKernel(const Context& dev_ctx,
} }
if (beta2_pow.place().GetType() == phi::AllocationType::CPU) { if (beta2_pow.place().GetType() == phi::AllocationType::CPU) {
// copy beta2_pow_out from xpu to cpu // copy beta2_pow_out from xpu to cpu
paddle::memory::Copy(beta2_pow.place(), memory_utils::Copy(beta2_pow.place(),
dev_ctx.template HostAlloc<MT>(beta2_pow_out), dev_ctx.template HostAlloc<MT>(beta2_pow_out),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
beta2_pow_out_ptr, beta2_pow_out_ptr,
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -49,7 +49,7 @@ void MaskedSelectKernel(const Context& dev_ctx, ...@@ -49,7 +49,7 @@ void MaskedSelectKernel(const Context& dev_ctx,
xpu::nonzero_count( xpu::nonzero_count(
dev_ctx.x_context(), mask_data, out_size, mask.numel()), dev_ctx.x_context(), mask_data, out_size, mask.numel()),
"nonzero_count "); "nonzero_count ");
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&out_size_cpu), static_cast<void*>(&out_size_cpu),
mask.place(), mask.place(),
static_cast<void*>(out_size), static_cast<void*>(out_size),
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/mean_all_grad_kernel.h" #include "paddle/phi/kernels/mean_all_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -40,7 +40,7 @@ void MeanAllGradKernel(const Context& dev_ctx, ...@@ -40,7 +40,7 @@ void MeanAllGradKernel(const Context& dev_ctx,
const T* dy = OG->data<T>(); const T* dy = OG->data<T>();
T dy0_value; T dy0_value;
xpu_wait(dev_ctx.x_context()->xpu_stream); xpu_wait(dev_ctx.x_context()->xpu_stream);
paddle::memory::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T)); memory_utils::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T));
float dy0_fp32 = static_cast<float>(dy0_value); float dy0_fp32 = static_cast<float>(dy0_value);
dy0_fp32 = dy0_fp32 / static_cast<float>(IG->numel()); dy0_fp32 = dy0_fp32 / static_cast<float>(IG->numel());
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/nonzero_kernel.h" #include "paddle/phi/kernels/nonzero_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/backends/xpu/xpu_header.h" #include "paddle/phi/backends/xpu/xpu_header.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -42,7 +42,7 @@ void NonZeroKernel(const Context& dev_ctx, ...@@ -42,7 +42,7 @@ void NonZeroKernel(const Context& dev_ctx,
ret, ret,
XPUAPIErrorMsg[ret])); XPUAPIErrorMsg[ret]));
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&true_num_cpu), static_cast<void*>(&true_num_cpu),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
static_cast<void*>(true_num), static_cast<void*>(true_num),
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
#include <random> #include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/generator.h" #include "paddle/phi/core/generator.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -47,7 +47,7 @@ void RandintRawKernel(const Context& dev_ctx, ...@@ -47,7 +47,7 @@ void RandintRawKernel(const Context& dev_ctx,
for (int64_t i = 0; i < numel; ++i) { for (int64_t i = 0; i < numel; ++i) {
data_cpu[i] = dist(*engine); data_cpu[i] = dist(*engine);
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
data, data,
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()), reinterpret_cast<void*>(data_cpu.get()),
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -48,7 +48,7 @@ void RmspropDenseKernel(const Context& dev_ctx, ...@@ -48,7 +48,7 @@ void RmspropDenseKernel(const Context& dev_ctx,
" But received learning rate dim [%s] ", " But received learning rate dim [%s] ",
learning_rate.dims().size())); learning_rate.dims().size()));
T learning_rate_cpu = 0.0f; T learning_rate_cpu = 0.0f;
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
static_cast<void*>(&learning_rate_cpu), static_cast<void*>(&learning_rate_cpu),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
static_cast<const void*>(learning_rate.data()), static_cast<const void*>(learning_rate.data()),
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/roi_align_kernel.h" #include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -51,7 +51,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, ...@@ -51,7 +51,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
if (boxes_num) { if (boxes_num) {
rois_batch_size = boxes_num->numel(); rois_batch_size = boxes_num->numel();
std::vector<int> rois_num_list(rois_batch_size); std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(cplace, memory_utils::Copy(cplace,
rois_num_list.data(), rois_num_list.data(),
xplace, xplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -73,7 +73,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, ...@@ -73,7 +73,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data), int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data),
(rois_batch_size + 1) * sizeof(int)); (rois_batch_size + 1) * sizeof(int));
PADDLE_ENFORCE_XPU_SUCCESS(r); PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(xplace, memory_utils::Copy(xplace,
roi_id_data, roi_id_data,
cplace, cplace,
cpu_lod, cpu_lod,
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/roi_align_kernel.h" #include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -62,7 +62,7 @@ void RoiAlignKernel(const Context& dev_ctx, ...@@ -62,7 +62,7 @@ void RoiAlignKernel(const Context& dev_ctx,
batch_size)); batch_size));
std::vector<int> rois_num_list(rois_batch_size); std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(cplace, memory_utils::Copy(cplace,
rois_num_list.data(), rois_num_list.data(),
xplace, xplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -115,7 +115,7 @@ void RoiAlignKernel(const Context& dev_ctx, ...@@ -115,7 +115,7 @@ void RoiAlignKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data), int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data),
(rois_batch_size + 1) * sizeof(int)); (rois_batch_size + 1) * sizeof(int));
PADDLE_ENFORCE_XPU_SUCCESS(r); PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(xplace, memory_utils::Copy(xplace,
roi_id_data, roi_id_data,
cplace, cplace,
cpu_lod, cpu_lod,
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -66,7 +66,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx, ...@@ -66,7 +66,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx,
x.numel()); x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
int non_zero_cpu = 0; int non_zero_cpu = 0;
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu), static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
static_cast<void*>(non_zero), static_cast<void*>(non_zero),
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -62,7 +62,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx, ...@@ -62,7 +62,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx,
x.numel()); x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
int non_zero_cpu = 0; int non_zero_cpu = 0;
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu), static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
static_cast<void*>(non_zero), static_cast<void*>(non_zero),
......
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include <limits> #include <limits>
#include <random> #include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/truncated_normal.h" #include "paddle/phi/kernels/funcs/truncated_normal.h"
...@@ -52,7 +52,7 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, ...@@ -52,7 +52,7 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx,
data_cpu[i] = truncated_normal(dist(*engine)); data_cpu[i] = truncated_normal(dist(*engine));
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
data, data,
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()), reinterpret_cast<void*>(data_cpu.get()),
......
...@@ -16,8 +16,8 @@ limitations under the License. */ ...@@ -16,8 +16,8 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/uniform_real_distribution.h" #include "paddle/phi/kernels/funcs/uniform_real_distribution.h"
...@@ -67,7 +67,7 @@ void UniformRawKernel(const Context &dev_ctx, ...@@ -67,7 +67,7 @@ void UniformRawKernel(const Context &dev_ctx,
} }
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
data, data,
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void *>(data_cpu.get()), reinterpret_cast<void *>(data_cpu.get()),
......
...@@ -16,9 +16,8 @@ limitations under the License. */ ...@@ -16,9 +16,8 @@ limitations under the License. */
#include "paddle/phi/common/transform.h" #include "paddle/phi/common/transform.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/hostdevice.h"
template <typename T> template <typename T>
...@@ -37,9 +36,6 @@ class Multiply { ...@@ -37,9 +36,6 @@ class Multiply {
HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
}; };
using paddle::memory::Alloc;
using paddle::memory::Copy;
using paddle::platform::CPUPlace; using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace; using paddle::platform::CUDAPlace;
using phi::CPUContext; using phi::CPUContext;
...@@ -63,13 +59,15 @@ TEST(Transform, GPUUnary) { ...@@ -63,13 +59,15 @@ TEST(Transform, GPUUnary) {
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace())); auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4); auto gpu_allocation = phi::memory_utils::Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr()); float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx->stream()); phi::memory_utils::Copy(
gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx->stream());
Transform<phi::GPUContext> trans; Transform<phi::GPUContext> trans;
trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10)); trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx->Wait(); ctx->Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx->stream()); phi::memory_utils::Copy(
CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx->stream());
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5); ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
} }
...@@ -91,13 +89,15 @@ TEST(Transform, GPUBinary) { ...@@ -91,13 +89,15 @@ TEST(Transform, GPUBinary) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace())); auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto gpu_allocation = Alloc(gpu0, sizeof(buf)); auto gpu_allocation = phi::memory_utils::Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr()); int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx->stream()); phi::memory_utils::Copy(
gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx->stream());
Transform<phi::GPUContext> trans; Transform<phi::GPUContext> trans;
trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>()); trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx->Wait(); ctx->Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx->stream()); phi::memory_utils::Copy(
CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx->stream());
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]); ASSERT_EQ((i + 1) * (i + 1), buf[i]);
} }
......
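The GPU transform tests above exercise the stream-taking overload of the same wrapper. The sketch below, assuming a CUDA or HIP build and using hypothetical buffer and function names, shows the asynchronous host/device round trip with the stream passed as the sixth argument, followed by the explicit Wait the tests rely on.

// Illustrative sketch, assuming a CUDA/HIP build; names are hypothetical and
// only demonstrate the stream-based Copy overload.
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"

void RoundTripThroughDevice() {
  phi::GPUPlace gpu0(0);  // assumes device 0 is present
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(gpu0));

  float host_buf[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  auto device_alloc = phi::memory_utils::Alloc(gpu0, sizeof(host_buf));
  float* device_buf = static_cast<float*>(device_alloc->ptr());

  // Host-to-device copy enqueued on the context's stream (sixth argument).
  phi::memory_utils::Copy(
      gpu0, device_buf, phi::CPUPlace(), host_buf, sizeof(host_buf), ctx->stream());

  // Device-to-host copy back on the same stream, then synchronize before
  // reading host_buf again.
  phi::memory_utils::Copy(
      phi::CPUPlace(), host_buf, gpu0, device_buf, sizeof(host_buf), ctx->stream());
  ctx->Wait();
}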
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_memcpy.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -96,7 +95,7 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -96,7 +95,7 @@ TEST(StridedMemcpy, GPUCrop) {
auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src)); auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr()); int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream()); memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
phi::DDim src_stride({5, 1}); phi::DDim src_stride({5, 1});
...@@ -110,7 +109,7 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -110,7 +109,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::funcs::StridedMemcpy<int>( phi::funcs::StridedMemcpy<int>(
*ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst); *ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream()); memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
ctx->Wait(); ctx->Wait();
ASSERT_EQ(1, dst[0]); ASSERT_EQ(1, dst[0]);
...@@ -135,7 +134,7 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -135,7 +134,7 @@ TEST(StridedMemcpy, GPUConcat) {
auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src)); auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr()); int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream()); memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
int dst[8]; int dst[8];
auto gpu_dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst)); auto gpu_dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
...@@ -150,7 +149,7 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -150,7 +149,7 @@ TEST(StridedMemcpy, GPUConcat) {
phi::funcs::StridedMemcpy<int>( phi::funcs::StridedMemcpy<int>(
*ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2); *ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream()); memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
ctx->Wait(); ctx->Wait();
// clang-format off // clang-format off
......